Repository: nebuly-ai/optimate Branch: main Commit: a6d302f912b4 Files: 306 Total size: 1.6 MB Directory structure: gitextract_7q29s3ew/ ├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── README.md ├── monitoring/ │ └── nebuly/ │ └── __init__.py └── optimization/ ├── .github/ │ └── workflows/ │ └── tests.yml ├── chatllama/ │ ├── LICENSE │ ├── README.md │ ├── artifacts/ │ │ ├── config/ │ │ │ ├── config.yaml │ │ │ ├── ds_config.json │ │ │ └── peft_config.yaml │ │ ├── datasets/ │ │ │ ├── actor_dataset.json │ │ │ ├── reward_dataset.json │ │ │ └── rlhf_dataset.json │ │ ├── download_dataset.py │ │ ├── extend_rlhf_dataset.py │ │ ├── generate_actor_dataset.py │ │ ├── generate_rewards.py │ │ ├── main.py │ │ └── templates.json │ ├── chatllama/ │ │ ├── __init__.py │ │ ├── langchain_modules/ │ │ │ ├── __init__.py │ │ │ └── prompt_templates.py │ │ ├── llama_model.py │ │ └── rlhf/ │ │ ├── __init__.py │ │ ├── actor.py │ │ ├── config.py │ │ ├── dataset.py │ │ ├── model_list.py │ │ ├── model_loader.py │ │ ├── reward.py │ │ ├── trainer.py │ │ └── utils.py │ └── setup.py ├── cloud_surfer/ │ └── README.md ├── forward_forward/ │ ├── README.md │ ├── forward_forward/ │ │ ├── __init__.py │ │ ├── api/ │ │ │ ├── __init__.py │ │ │ └── functions.py │ │ ├── app.py │ │ ├── operations/ │ │ │ ├── __init__.py │ │ │ ├── build_models.py │ │ │ ├── data.py │ │ │ ├── fetch_operations.py │ │ │ └── trainers.py │ │ ├── root_op.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── labels.py │ │ ├── modules.py │ │ └── utils.py │ ├── requirements.txt │ └── setup.py ├── large_speedster/ │ └── README.md ├── nebullvm/ │ ├── .pre-commit-config.yaml │ ├── CONTRIBUTING.md │ ├── Dockerfile │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── azure-pipelines.yml │ ├── docker_build.sh │ ├── docs/ │ │ ├── Makefile │ │ ├── README.md │ │ ├── conf.py │ │ ├── index.rst │ │ ├── modules/ │ │ │ ├── api.rst │ │ │ ├── converters.rst │ │ │ ├── index.rst │ │ │ ├── inference_learners.rst │ │ │ ├── installers.rst │ │ │ └── 
optimizers.rst │ │ └── requirements-docs.txt │ ├── nebullvm/ │ │ ├── __init__.py │ │ ├── api/ │ │ │ └── __init__.py │ │ ├── apps/ │ │ │ ├── __init__.py │ │ │ └── base.py │ │ ├── config.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── models.py │ │ │ ├── tests/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_models.py │ │ │ └── types.py │ │ ├── installers/ │ │ │ ├── __init__.py │ │ │ ├── auto_installer.py │ │ │ ├── install_bladedisc.sh │ │ │ ├── install_fastertransformer.sh │ │ │ ├── install_tensor_rt.sh │ │ │ ├── install_tvm.sh │ │ │ ├── install_tvm_prerequisites.sh │ │ │ ├── installers.py │ │ │ ├── tests/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_install_frameworks.py │ │ │ └── tvm_installers/ │ │ │ ├── arm/ │ │ │ │ └── config.cmake │ │ │ ├── arm_cuda/ │ │ │ │ └── config.cmake │ │ │ ├── x86/ │ │ │ │ └── config.cmake │ │ │ └── x86_cuda/ │ │ │ └── config.cmake │ │ ├── operations/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── conversions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── converters.py │ │ │ │ ├── huggingface.py │ │ │ │ ├── pytorch.py │ │ │ │ ├── tensorflow.py │ │ │ │ └── utils.py │ │ │ ├── fetch_operations/ │ │ │ │ ├── __init__.py │ │ │ │ └── local.py │ │ │ ├── inference_learners/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── blade_disc.py │ │ │ │ ├── builders.py │ │ │ │ ├── deepsparse.py │ │ │ │ ├── faster_transformer.py │ │ │ │ ├── huggingface.py │ │ │ │ ├── neural_compressor.py │ │ │ │ ├── onnx.py │ │ │ │ ├── openvino.py │ │ │ │ ├── tensor_rt.py │ │ │ │ ├── tensorflow.py │ │ │ │ ├── torch_dynamo.py │ │ │ │ ├── torch_neuron.py │ │ │ │ ├── torch_xla.py │ │ │ │ ├── torchscript.py │ │ │ │ ├── tvm.py │ │ │ │ └── utils.py │ │ │ ├── measures/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── measures.py │ │ │ │ └── utils.py │ │ │ └── optimizations/ │ │ │ ├── __init__.py │ │ │ ├── compilers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── deepsparse.py │ │ │ │ ├── faster_transformer/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── bert/ │ │ │ │ │ │ ├── 
__init__.py │ │ │ │ │ │ ├── checkpoint_quantization.py │ │ │ │ │ │ └── modeling_bert.py │ │ │ │ │ └── gpt/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── utils/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── gpt_decoder.py │ │ │ │ │ └── huggingface_gpt_convert.py │ │ │ │ ├── intel_neural_compressor.py │ │ │ │ ├── onnxruntime.py │ │ │ │ ├── openvino.py │ │ │ │ ├── quantizations/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── intel_neural_compressor.py │ │ │ │ │ ├── onnx.py │ │ │ │ │ ├── openvino.py │ │ │ │ │ ├── pytorch.py │ │ │ │ │ ├── tensor_rt.py │ │ │ │ │ ├── tensorflow.py │ │ │ │ │ ├── tvm.py │ │ │ │ │ └── utils.py │ │ │ │ ├── tensor_rt.py │ │ │ │ ├── tensorflow.py │ │ │ │ ├── torch_dynamo.py │ │ │ │ ├── torch_neuron.py │ │ │ │ ├── torch_xla.py │ │ │ │ ├── torchscript.py │ │ │ │ ├── tvm.py │ │ │ │ └── utils.py │ │ │ ├── compressors/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── intel.py │ │ │ │ ├── scripts/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── neural_magic_training.py │ │ │ │ └── sparseml.py │ │ │ ├── optimize_inference.py │ │ │ ├── optimizers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── optimizers.py │ │ │ ├── tests/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_deepsparse.py │ │ │ │ ├── test_intel_neural_compressor.py │ │ │ │ ├── test_onnxruntime.py │ │ │ │ ├── test_openvino.py │ │ │ │ ├── test_tensor_rt.py │ │ │ │ ├── test_tensorflow.py │ │ │ │ ├── test_torch_dynamo.py │ │ │ │ ├── test_torchscript.py │ │ │ │ ├── test_tvm.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── optional_modules/ │ │ │ ├── __init__.py │ │ │ ├── blade_disc.py │ │ │ ├── deepsparse.py │ │ │ ├── diffusers.py │ │ │ ├── dummy.py │ │ │ ├── huggingface.py │ │ │ ├── neural_compressor.py │ │ │ ├── onnx.py │ │ │ ├── onnxruntime.py │ │ │ ├── onnxsim.py │ │ │ ├── openvino.py │ │ │ ├── tensor_rt.py │ │ │ ├── tensorflow.py │ │ │ ├── torch.py │ │ │ ├── torch_neuron.py │ │ │ ├── torch_tensorrt.py │ │ │ ├── torch_xla.py │ │ │ ├── tvm.py │ │ │ └── utils.py │ │ └── tools/ │ │ ├── __init__.py │ │ ├── 
adapters.py │ │ ├── benchmark.py │ │ ├── data.py │ │ ├── diffusers.py │ │ ├── feedback_collector.py │ │ ├── hardware_utils.py │ │ ├── huggingface.py │ │ ├── logger.py │ │ ├── onnx.py │ │ ├── pytorch.py │ │ ├── tests/ │ │ │ ├── __init__.py │ │ │ ├── test_data.py │ │ │ ├── test_hardware_utils.py │ │ │ └── test_utils.py │ │ ├── tf.py │ │ ├── transformations.py │ │ ├── utils.py │ │ └── venv.py │ ├── nebullvm.toml │ ├── requirements-dev.txt │ ├── requirements.txt │ └── setup.py ├── open_alpha_tensor/ │ ├── README.md │ ├── config.json │ ├── main.py │ ├── open_alpha_tensor/ │ │ ├── __init__.py │ │ ├── api/ │ │ │ ├── __init__.py │ │ │ └── functions.py │ │ ├── config.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── actors/ │ │ │ │ ├── __init__.py │ │ │ │ └── stage.py │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ ├── basis_change.py │ │ │ │ ├── dataset.py │ │ │ │ ├── generation.py │ │ │ │ └── utils.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── alpha_tensor.py │ │ │ │ ├── attention.py │ │ │ │ ├── extras.py │ │ │ │ ├── heads.py │ │ │ │ └── torso.py │ │ │ └── training.py │ │ ├── operations/ │ │ │ ├── __init__.py │ │ │ ├── checkpoint_op.py │ │ │ ├── model_op.py │ │ │ └── training_op.py │ │ └── root_op.py │ ├── resources/ │ │ └── open_alpha_tensor.md │ └── setup.py ├── optimate/ │ └── README.md └── speedster/ ├── README.md ├── docs/ │ └── en/ │ ├── docs/ │ │ ├── advanced_options.md │ │ ├── benchmarks.md │ │ ├── getting_started/ │ │ │ ├── diffusers_getting_started.md │ │ │ ├── hf_getting_started.md │ │ │ ├── onnx_getting_started.md │ │ │ ├── pytorch_getting_started.md │ │ │ └── tf_getting_started.md │ │ ├── hardware.md │ │ ├── installation.md │ │ ├── key_concepts.md │ │ ├── notebooks.md │ │ ├── overview.md │ │ └── telemetry.md │ └── mkdocs.yaml ├── notebooks/ │ ├── README.md │ ├── diffusers/ │ │ ├── Accelerate_Stable_Diffusion_with_Speedster.ipynb │ │ └── Readme.md │ ├── huggingface/ │ │ ├── Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb │ │ ├── 
Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb │ │ ├── Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb │ │ ├── Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb │ │ ├── Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb │ │ ├── Readme.md │ │ └── faster_transformer_bert.py │ ├── onnx/ │ │ ├── Accelerate_ONNX_ResNet50_with_Speedster.ipynb │ │ └── Readme.md │ ├── pytorch/ │ │ ├── Accelerate_PyTorch_ResNet50_with_Speedster.ipynb │ │ ├── Accelerate_PyTorch_ViT_with_Speedster.ipynb │ │ ├── Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb │ │ ├── Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb │ │ ├── Accelerate_fast_ai_Resnet34_with_Speedster.ipynb │ │ └── Readme.md │ └── tensorflow/ │ ├── Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb │ └── Readme.md ├── requirements.txt ├── setup.py ├── speedster/ │ ├── __init__.py │ ├── api/ │ │ ├── __init__.py │ │ ├── functions.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── test_huggingface.py │ │ ├── test_onnx.py │ │ ├── test_pytorch.py │ │ ├── test_tensorflow.py │ │ └── utils.py │ ├── root_op.py │ ├── speedster.py │ ├── tests/ │ │ ├── __init__.py │ │ └── test_root_op.py │ └── utils.py └── speedster.toml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation optimization/nebullvm/docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version .idea # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # MacOS DS_Store .DS_Store # Pickle folder .pkl_memoize_py3 # Folder where optimized models are stored optimized_model # Config file for tests coverage .coveragerc ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 message: "If you use this software, please cite it as below." 
authors: - family-names: "Nebuly" given-names: "S.r.l" - family-names: "Fiori" given-names: "Diego" orcid: "https://orcid.org/0000-0003-1910-0565" - family-names: "Sofi" given-names: "Valerio" orcid: "https://orcid.org/0000-0001-5978-897X" title: "nebullvm" version: 0.4.3 date-released: 2022-10-10 url: "https://github.com/nebuly-ai/nebullvm" ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at social@nebuly.ai. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. 
Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. ================================================ FILE: README.md ================================================ # OptiMate **[Legacy]** This repository is now in a legacy phase and is no longer actively maintained. Although the source code is still available in the Git history, there will be no additional updates or official support. **[About Nebuly]** Our team is fully committed to creating the best user-experience platform for LLMs so that companies can understand user behavior at scale when interacting with their LLM-based products. - To learn more about how to get started, visit our [official documentation](https://docs.nebuly.com/welcome/overview) - If you need enterprise support, please contact us [here](https://www.nebuly.com/nebuly-book-a-demo) **[About optimate]** We have open-sourced a couple of internal projects to the community, but we are not currently maintaining them. Optimate is a collection of libraries designed to help you optimize your AI models. It is an open-source project developed by Nebuly AI but is **not actively maintained**. 
The tools available to assist you in your optimization are: ✅ [Speedster](https://github.com/nebuly-ai/optimate/tree/main/optimization/speedster): reduce inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs) ✅ [Nos](https://github.com/nebuly-ai/nos): reduce infrastructure costs by leveraging real-time dynamic partitioning and elastic quotas to maximize the utilization of your Kubernetes GPU cluster ✅ [ChatLLaMA](https://github.com/nebuly-ai/optimate/tree/main/optimization/chatllama): reduce hardware and data costs by leveraging fine-tuning optimization techniques and RLHF alignment ================================================ FILE: monitoring/nebuly/__init__.py ================================================ ================================================ FILE: optimization/.github/workflows/tests.yml ================================================ name: Run tests on: push: branches: - "main" paths-ignore: - ".github/**" - "*.md" - "docs/**" - "notebooks/**" pull_request: branches: - "main" paths-ignore: - ".github/**" - "*.md" - "docs/**" - "notebooks/**" jobs: test_on_ubuntu_cpu: runs-on: ubuntu-20.04 strategy: matrix: # Run in all these versions of Python python-version: [ 3.8, 3.9, "3.10" ] steps: # Checkout the latest code from the repo - name: Checkout repo uses: actions/checkout@v2 # Setup which version of Python to use - name: Set Up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} # Display the Python version being used - name: Display Python version run: python -c "import sys; print(sys.version)" # Install nebullvm - name: Install nebullvm run: | python -m pip install --upgrade pip pip install . # Install Speedster - name: Install Speedster run: | cd apps/accelerate/speedster pip install . cd ../../.. 
# Install PyTorch - name: Install PyTorch run: python -m pip install torch==2.0.0 # Install compilers except tvm - name: Install deep learning compilers run: python -m nebullvm.installers.auto_installer --compilers all # Install requirements for testing - name: Install requirements for testing run: pip install -r "requirements-dev.txt" # Run api tests - name: Run api tests run: | export SPEEDSTER_DISABLE_TELEMETRY=1 cd apps/accelerate/speedster pytest cd ../../.. # Run components tests - name: Run components tests run: | cd nebullvm pytest cd ../ # test_on_windows_cpu: # runs-on: windows-latest # # strategy: # matrix: # # Run in all these versions of Python # python-version: [ 3.8, 3.9, "3.10" ] # # steps: # # Checkout the latest code from the repo # - name: Checkout repo # uses: actions/checkout@v2 # # Setup which version of Python to use # - name: Set Up Python ${{ matrix.python-version }} # uses: actions/setup-python@v2 # with: # python-version: ${{ matrix.python-version }} # # Display the Python version being used # - name: Display Python version # run: python -c "import sys; print(sys.version)" # # Install nebullvm # - name: Install nebullvm # run: | # python -m pip install --upgrade pip # pip install . # # Install Speedster # - name: Install Speedster # run: | # cd apps/accelerate/speedster # pip install . # cd ../../.. # - name: Install PyTorch # run: python -m pip install torch==2.0.0 # # Install compilers except tvm # - name: Install deep learning compilers # run: python -m nebullvm.installers.auto_installer --compilers all # # Install requirements for testing # - name: Install requirements for testing # run: pip install -r "requirements-dev.txt" # # Run api tests # - name: Run api tests # run: | # $env:SPEEDSTER_DISABLE_TELEMETRY=1 # cd apps/accelerate/speedster # pytest # cd ../../.. 
# # Run components tests # - name: Run components tests # run: | # cd nebullvm # pytest # cd ../ # ================================================ FILE: optimization/chatllama/LICENSE ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. 
And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. 
"The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. 
A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. 
All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>. ================================================ FILE: optimization/chatllama/README.md ================================================ # **🦙 ChatLLaMA** > :warning: Please note this library does NOT contain LLaMA’s weights; to access the weights, you need to apply to Meta's form. `ChatLLaMA` 🦙 is a library that allows you to efficiently leverage LLMs' fine-tuning capabilities using your own data and the least amount of compute possible. Its purpose is to give developers peace of mind, by abstracting the efforts required for computational optimization and for the collection of large amounts of data. If you like the project, please show your support by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers). ## Quick install You can install the package with pip: ```bash pip install chatllama-py ``` Then you need to install the Llama models cloned from [Meta's repository](https://github.com/facebookresearch/llama): ```bash git clone https://github.com/facebookresearch/llama.git cd llama pip install -r requirements.txt pip install -e . ``` Follow the instructions in the Llama repository to download the model weights and tokenizer. ## What can ChatLLaMA help with?
`ChatLLaMA` 🦙 has been designed to help developers with various use cases, all related to RLHF training and optimized inference. These are some of the use cases that better resonate with our community wishlist: - I want to train an efficient ChatGPT-like assistant on my local hardware infrastructure using a limited amount of data; - I want to create my own personalized version of ChatGPT-like assistant without costs getting out of control; - I want to understand which model architecture (LLaMA, OPT, GPTJ, etc.) best fits my requirements in terms of hardware, compute budget, and performance. ## Getting started In this Getting Started we will set up a local RLHF training that will allow you to create your own ChatGPT-like assistant. In this example, we used OPT-1.3B; wherever possible, we used open-source datasets, and we ran the training on an NVIDIA A100. If you want to use other models or hardware, we recommend reading the [supported models](#supported-models), [hardware requirements](#hardware-requirements) and [dataset preparation](#dataset-preparation) sections. In this example, we ran a few epochs of the training; this took a few hours. Any feedback on total training time, on any hardware, would be greatly appreciated. Please share your experience with our community on our Discord channel. To quickly get you started, we will focus on 3 key steps: 1. Download YAML files to customize your training process. Please note that all the parameters of the library can be managed in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml); 2. Prepare the 3 datasets needed to train the actor model, the reward model and perform RLHF; 3. Train the models on your local infrastructure.
1 - YAML download First, let’s get the artifacts for running ChatLLaMA. The artifacts contain: - [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml): config file for model and dataset. This allows you to 1) select the model you prefer (LLaMA, OPT, BLOOM, etc.) and 2) change all the hyperparameters of the training process; - [`ds_config.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/ds_config.json): config file to define DeepSpeed training parameters; - [`peft_config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/peft_config.yaml): config file to define PEFT parameters; PEFT is used for efficient training with Hugging Face models. It can be used for setting the LoRA parameters, such as rank and precision; - [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json): synthetic data generation templates that can be used to personalize the creation of the dataset. The templates are used for feeding LLMs during the data generation. Note that the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file contains a dictionary having as *keys* the training steps (`actor`, `reward`, `rlhf`) and as *values* a string containing the personalization requests of the user. For more details see the [dataset preparation](#dataset-preparation) section; - [`main.py`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/main.py): file to train the model.
```bash wget -O artifacts.zip https://nbllabartifacts.blob.core.windows.net/chatllama/artifacts.zip\?sp\=r\&st\=2023-03-08T14:53:24Z\&se\=2100-03-08T22:53:24Z\&spr\=https\&sv\=2021-06-08\&sr\=b\&sig\=jqr%2B2ZkR0SW9RjV0pDOdQ%2BDulLXLjbZ36vmNd4XxxyQ%3D unzip artifacts.zip ``` Once you have run the command above, you will find all the artifacts in the [`artifacts/`](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama/artifacts) directory. Now you can move on to the next section regarding the dataset preparation.
2 - Dataset preparation Before training the model, we need to prepare 3 datasets: - `actor_training_data`: this is the JSON dataset used in the supervised fine-tuning. It consists of examples of unlabelled conversations, e.g. collection of prompts and responses; - `rlhf_training_data`: this is the JSON dataset used for RLHF training. It consists of a collection of possible input user prompts; - `reward_training_data`: this is the JSON dataset used to train the reward model. It consists of responses with associated scores. In this example, we are using only publicly available datasets and synthetic generation; if you want to use your own data instead, please see the [Dataset preparation](#dataset-preparation) section. First, let’s download the `actor_training_data` and the `rlhf_training_data`: ```bash python artifacts/download_dataset.py ARLHF --path ./datasets --number_of_samples 200 ``` Finally, let’s create the `reward_training_data` using `davinci-003` for synthetic data generation. ```bash export OPENAI_API_KEY=YOUR_API_KEY python artifacts/generate_rewards.py ./datasets/reward_training_data.json ``` > :warning: Creating the `reward_training_data` with `davinci-003` is not free, i.e. it costs a few dollars. If you prefer avoiding external paid APIs, we suggest using HuggingFace’s models (e.g. flan_t5_xl) as described in more detail in the [Supported models](#supported-models) section. > > :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI". At this point, we have successfully created the 3 datasets. We can therefore move on to the final section and start the training.
3 - Training You can train the 3 models in separate steps: - Train the Reward Model ```bash python artifacts/main.py artifacts/config/config.yaml --type REWARD ``` - Pre-Train the Actor Model ```bash python artifacts/main.py artifacts/config/config.yaml --type ACTOR ``` - Train the Actor with reinforcement learning. ```bash python artifacts/main.py artifacts/config/config.yaml --type RL ``` or, equivalently, the 3 trainings can also be pipelined using the flag ALL. ```bash python artifacts/main.py artifacts/config/config.yaml --type ALL ``` Note that the path to the datasets and the training hyper-parameters of the training process are specified in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml) file.
## Contributing and Roadmap As an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see our [Roadmap page](https://github.com/users/nebuly-ai/projects/1/views/1) for more information on how to get involved. You can participate in the following ways: 1. Submit an issue or PR on GitHub 2. Join our [Discord group](https://discord.gg/77d5kGSa8e) to chat ## Supported models
Actor models We support models that can be run efficiently with a limited amount of compute, such as LLaMA and 🤗 transformers. These are the currently supported models with up to 20B parameters: - LLaMA: 7B and 13B, please note this library does NOT contain LLaMA’s weights; to access the weights, you need to apply to Meta's [form](https://forms.gle/jk851eBVbX1m5TAv5). - GPTJ: 6B - GPTNeoX: 1.3B, 20B - **(⚠️WIP)** Flan-T5: 80M, 259M, 780M, 3B, 11B - OPT: 125M, 359M, 1.3B, 2.7B, 6.7B, 13B - BLOOM: 560M, 1.1B, 1.7B, 3B, 7.1B - BLOOMZ: 560M, 1.1B, 1.7B, 3B, 7.1B - Galactica: 125M, 1.3B, 6.7B
Reward models We suggest using models under 6B from 🤗 transformers: - GPT2: 124M, 355M, 774M, 1.5B - OPT: 125M, 359M, 1.3B, 2.7B - GPTJ: 6B - BLOOMZ: 560M, 1.1B, 1.7B, 3B - **(⚠️WIP)** OpenAssistant [pre-trained reward models](https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2)
Synthetic data generation models We support both APIs from OpenAI and 🤗 transformers: - OpenAI: text-davinci-003, gpt-3.5-turbo **(⚠️WIP)** - HuggingFace: Flan-T5 (3B and 11B) > :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI". :warning: If you need support for different models, please open an issue and we will get to work.
## Hardware requirements
Training Larger actor models require more powerful hardware. Here is a rough hardware recommendation table, suggesting the right type of hardware for different actor model sizes: - 125M to 1.3B → 1x Nvidia 3090/4090 - 1.3B to 3B → 1x Nvidia A100 (80Gb) - 3B with DeepSpeed CPU off-loading → 1x Nvidia 3090/4090 - 3B to 7B with DeepSpeed ZeRO → 4x Nvidia T4 - 3B to 13B → 4x Nvidia A100 (80Gb) - 13B to 20B with DeepSpeed ZeRO → 4x Nvidia A100 (80Gb) - 13B to 20B → 8x Nvidia A100 (80Gb)
Inference **(⚠️WIP)** When it comes to inference optimization, ChatLLaMA will support the following optimization techniques: - [ ] DeepSpeed ZeRO - [ ] FlexGen - [ ] HF Accelerate - [ ] PyTorch Vanilla
Please note that inference optimization has yet to be implemented. If you would like to contribute, please see the **issue roadmap**, community contributions are always welcome 😊. ## Dataset preparation To successfully train a ChatLLaMA assistant, you need 3 different datasets: `actor_training_data`, `rlhf_training_data` and `reward_training_data`.
Dataset for supervised fine-tuning of the actor model The `actor_training_data` is a collection of prompts with the associated responses as highlighted below: ```json [ { "user_input": "here the input of the user", "completion": "here the model completion" } ] ``` ChatLLaMA supports 4 different options to prepare the `actor_training_data`: *
Use 100% synthetic data The dataset can be synthetically generated by running the following command: ```bash python artifacts/generate_actor_dataset.py ``` > :warning: Note that this command will require a subscription to OpenAI. Generating the full dataset with `davinci-003` could cost approximately $200. > > :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI". Alternatively, you can generate the dataset for free using 🤗 transformers as described in the section [Supported models](#supported-models).
*
Use one of the open source datasets with assistant interactions Currently, we support: - [Anthropic HH RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf): this dataset consists of structured question/answer pairs with an LLM chatbot that includes selected and rejected answers; - [Stanford Human Preferences Dataset (SHP)](https://huggingface.co/datasets/stanfordnlp/SHP): this dataset is curated from selected "ask" subreddits, and includes questions that span a wide range of question/answer pairs based on the most upvoted responses. Please note that, unlike HH RLHF, this dataset is not intended to reduce harassment by selecting the ideal chatbot response, but instead weights the most helpful human responses. The datasets can be downloaded running the following command: ```bash python artifacts/download_dataset.py <dataset_name> --path <path_to_folder_for_download> --number_of_samples <N> ``` Where: - `<dataset_name>` could be "SHP" for the StanfordNLP/SHP dataset or "ARLHF" for the Anthropic/hh-rlhf dataset; - `<path_to_folder_for_download>` is the folder path to where the datasets are going to be created; - `<N>` is the number of samples of which the reward_dataset.json is composed.
*
Use 100% personalized dataset The user provides his own personalized full dataset. Datasets must be JSON files with the following format: ``` [ { "user_input": "here the input of the user", "completion": "here the model completion" } ] ``` Where the list contains multiple dictionaries, and each dictionary corresponds to a data sample. We suggest using more than 1000 data samples to run the actor training.
*
(⚠️WIP) Create the full dataset augmenting few custom data samples The dataset can be generated synthetically from a few prompt+response examples provided by the user (few =>10).
Dataset for RLHF The dataset for RLHF consists just of prompt examples: ```json [ { "user_input": "here the example of user input" } ] ``` It can be provided in 2 different ways: *
Few examples provided by the user and dataset synthetically expanded using LLM You need to add the key `rlhf` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file with the information about the task you want to perform and extra context needed by the LLM for the generation. Here is an example of template: ```json { "rlhf": "Here is the template for the generating RLHF prompts. The task we want to perform is ..." } ``` *Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json)*
*
The user provides the full dataset with possible interactions with the model The dataset needs to contain more than 1000 prompt examples: ```json [ { "user_input": "here the example of user input" } ] ``` The file must be named `rlhf_training_data.json`.
Dataset to train the reward model The `reward_training_data` is a collection of i) prompts, ii) completion and iii) score of the completion assigned according to the user feedback (the Human Feedback in RLHF). ```json [{ "user_input": "...", "completion": "...", "score": 1 }, ... ] ``` We support 3 different options to prepare the `reward_training_data`: - Fully Synthetic Score Generation In this case the reward dataset can be synthetically scored using an LLM as Human Feedback. We recommend the `reward_training_data` having at least 100 data samples. ```json [{ "user_input": "...", "completion": "...", "score": null }, ... ] ``` An LLM is used to assign the score to each entry. The LLM needs a prompt template containing all the instructions to evaluate the generated text. To do this, you should add the key `reward` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file. Here is an example: ```json { "reward": "Here is the template for the reward model. The rules are:\n\n1.Rule 1\n\n2. Rule 2" } ``` If no template is provided the default one is used. You can find the default template in `artifacts/generate_rewards.py`. Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json). Once you have the unlabelled dataset, you can generate the scores by running the following command: ```bash python artifacts/generate_rewards.py <dataset_path> --model <model_to_use> --temperature <t> --max_tokens <n> --reward_template <path_to_file.json> ``` Where: - `<dataset_path>` path to the reward dataset to be scored; - `<model_to_use>` model to use for the reward. 
Default and suggested: text-davinci-003 (more to come); - `<t>` temperature used to score the model; temperature=0.1; - `<n>` max_tokens of the generation; - `<path_to_file.json>` is the path to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file containing the template to be used for generating the reward. If no path is provided, the default template will be used. - The user provides their personalized full dataset Datasets must be JSON files in the following format: ```json [ { "user_input": "here type the user input", "completion": "here type the completion", "score": 4.0 }, { "user_input": "here type the user input", "completion": "random garbage", "score": 0.0 } ] ``` Note that at least 100 data samples are required in this case. The file must be named `reward_training_data.json` - **(⚠️WIP)** Few examples provided by the user and dataset synthetically expanded using LLM
# License See the [LICENSE](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/LICENSE) file. ================================================ FILE: optimization/chatllama/artifacts/config/config.yaml ================================================ --- trainer_config: # learning rates actor_lr: 0.000005 critic_lr: 0.000009 # PPO Hyperparameters actor_eps_clip: 0.2 critic_eps_clip: 0.2 beta_s: 0.02 # coefficient for the discounted rewards gamma_discounted: 1 # path to examples to be sampled (training dataset) see rlhf_dataset.json examples_path: "./datasets/rlhf_training_data.json" # number of episodes and generation performed for each episode # in the train() method num_episodes: 100 max_timesteps: 32 # number of timesteps after which the learn() method is called # (to update the weights) update_timesteps: 32 # number of example sampled at each timestep num_examples: 1 # batch and epochs for the training batch_size: 1 epochs: 1 # number of episodes after which update the checkpoints in RL training checkpoint_steps: 1000 # here specify the name of the actor_rl checkpoint from which resume # during actor RL training. If null load the last one. checkpoint_name: null actor_config: model: "facebook/opt-1.3b" model_folder: "./models" tokenizer_path: "path-to-tokenizer" train_dataset_path: "./datasets/actor_training_data.json" validation_dataset_path: null # froze model embedding during training froze_embeddings: True # use fairscale layers to build the model instead of vanilla pytorch # only for llama use_fairscale: False # max sequence length for the actor (i.e. prompt + completion) it depends on # the model used. 
max_sequence_length: 2048 # max tokens generated by the actor (completion only) max_tokens: 2048 # minimum number of tokens generated by the actor min_tokens: 100 # additional prompt tokens to be used for template or as safety additonal_prompt_tokens: 20 # temperature for the actor temperature: 0.1 batch_size: 2 # number iteration after print iteration_per_print: 1 lr: 0.000009 epochs: 1 # number of backpropagation after saving the checkpoints checkpoint_steps: 5000 # number of checkpoints to keep while removing the older # (keep memory consumption of checkpoints reasonable) n_checkpoints_to_keep: 5 # here specify the name of the actor checkpoint from which resume # during actor training. If null load the last one. checkpoint_name: null # deepspeed settings deepspeed_enable: False deepspeed_config_path: "./artifacts/config/ds_config.json" # accelerate settings accelerate_enable: False # use_peft - the parameters of PEFT can be modified in the peft_config.yaml peft_enable: False peft_config_path: "./artifacts/config/peft_config.yaml" reward_config: # model to be chosen are gp2-large, bart-base, longformer-base-4096 # more can be simply added in the reward.py __init__() model: "facebook/opt-125m" model_folder: "./models" # hidden size of the additional ffw head to produce the scores model_head_hidden_size: 2048 max_sequence_length: 2048 train_dataset_path: "./datasets/reward_training_data.json" validation_dataset_path: null batch_size: 8 epochs: 1 iteration_per_print: 1 # steps after which the checkpoint are saved checkpoint_steps: 10000 # here specify the name of the reward checkpoint from which resume # during reward training. If null load the last one. 
checkpoint_name: null lr: 0.000009 # deepspeed settings deepspeed_enable: False deepspeed_config_path: "./artifacts/config/ds_config.json" # accelerate settings accelerate_enable: False critic_config: # model to be chosen are gp2-large, bart-base, longformer-base-4096 # more can be simply added in the reward.py __init__() model: "facebook/opt-125m" # hidden size of the additional ffw head to produce the scores model_head_hidden_size: 2048 max_sequence_length: 2048 model_folder: "./models" # here specify the name of the critic checkpoint from which resume # during critic training. If null load the last one. checkpoint_name: null ================================================ FILE: optimization/chatllama/artifacts/config/ds_config.json ================================================ { "train_batch_size": 8, "gradient_accumulation_steps": 1, "optimizer": { "type": "Adam", "params": { "lr": 0.00015 } }, "fp16": { "enabled": false, "auto_cast": false, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "zero_optimization": { "stage": 2, "allgather_partitions": true, "allgather_bucket_size": 5e8, "overlap_comm": false, "reduce_scatter": true, "reduce_bucket_size": 5e8, "contiguous_gradients" : true, "offload_param": { "device": "cpu", "nvme_path": "/local_nvme", "pin_memory": true, "buffer_count": 5, "buffer_size": 1e8, "max_in_cpu": 1e9 }, "offload_optimizer": { "device": "cpu", "nvme_path": "/local_nvme", "pin_memory": true, "buffer_count": 4, "fast_init": false }, "stage3_max_live_parameters" : 1e9, "stage3_max_reuse_distance" : 1e9, "stage3_prefetch_bucket_size" : 5e8, "stage3_param_persistence_threshold" : 1e6, "sub_group_size" : 1e12, "elastic_checkpoint" : true, "stage3_gather_16bit_weights_on_model_save": true, "ignore_unused_parameters": true, "round_robin_gradients": true } } ================================================ FILE: optimization/chatllama/artifacts/config/peft_config.yaml 
if __name__ == "__main__":
    # Command-line entry point: build one of the supported open-source
    # datasets and save it under the requested folder.
    parser = argparse.ArgumentParser(
        prog="download_dataset.py",
        description="Download an open-source dataset (SHP or ARLHF)",
    )

    parser.add_argument(
        "dataset_name",
        help=(
            "Dataset to download. "
            "SHP: stanfordnlp/SHP or ARLHF: Anthropic/hh-rlhf"
        ),
        choices=["SHP", "ARLHF"],
    )
    parser.add_argument(
        "-p",
        "--path",
        help="Specify the path for the dataset",
        default="./datasets",
    )
    # Let argparse validate the integer so a bad value produces a regular
    # usage error instead of a traceback.
    parser.add_argument(
        "-n",
        "--number_of_samples",
        help="Specify the number of samples for the reward dataset",
        type=int,
        default=200,
    )

    args = parser.parse_args()

    # makedirs also creates missing parent folders and tolerates an
    # already existing destination.
    os.makedirs(args.path, exist_ok=True)

    n_samples = args.number_of_samples

    if args.dataset_name == "SHP":
        dataset = StanfordNLPSHPDataset()
        dataset.save_dataset(args.path, n_samples)

    elif args.dataset_name == "ARLHF":
        dataset = AnthropicRLHF()
        dataset.save_dataset(
            args.path,
            n_samples,
        )
def main():
    """Expand the RLHF prompt dataset with LLM-generated prompts.

    Reads ``rlhf_training_data.json`` from ``--data_dir`` and the ``rlhf``
    entry of the ``--templates`` JSON file, generates ``--num_prompts`` new
    prompts (each one seeded with a randomly drawn existing example) and
    writes the enlarged list back to the same dataset file.

    Raises:
        ValueError: if the templates file has no ``rlhf`` entry, if the seed
            dataset is empty while prompts are requested, or if the model
            name contains neither "davinci" nor "t5".
    """
    import json
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="Model name.",
        default="google/flan-t5-xl",
    )
    # Both paths are mandatory: without them the script used to fail later
    # with an opaque TypeError when joining/opening ``None``.
    parser.add_argument(
        "--templates", type=str, required=True, help="Path to templates."
    )
    parser.add_argument("--num_prompts", type=int, default=1000)
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Path where data are stored",
    )
    args = parser.parse_args()
    model_name = args.model
    templates_path = args.templates
    dataset_path = os.path.join(args.data_dir, "rlhf_training_data.json")

    with open(dataset_path, "r") as f:
        examples = json.load(f)
    with open(templates_path, "r") as f:
        templates = json.load(f)

    user_prompt = templates.get("rlhf")
    if user_prompt is None:
        raise ValueError("No rlhf template found.")
    if args.num_prompts > 0 and not examples:
        raise ValueError(
            "The RLHF dataset is empty: at least one seed example is needed."
        )

    if "davinci" in model_name:
        chain = use_langchain_model(
            user_prompt, model_name, with_examples=True
        )
    else:
        if "t5" not in model_name:
            raise ValueError("Only Flan-t5 models are supported for HF.")
        chain = use_huggingface_model(
            user_prompt, model_name, with_examples=True
        )

    # Newly generated prompts are appended to the pool, so later draws may
    # also pick synthetic examples as seeds.
    for _ in range(args.num_prompts):
        example = np.random.choice(examples)
        new_example = chain.run(example=example["user_input"])
        examples.append({"user_input": new_example})

    with open(dataset_path, "w") as f:
        json.dump(examples, f)
def get_sub_conversations(conversation: str, system_prompt: str):
    """Turn one conversation transcript into actor training samples.

    The transcript is cut at every ``"AI:"`` marker. For each AI turn a
    sample is produced whose ``user_input`` is ``system_prompt`` followed by
    everything that precedes that turn, and whose ``completion`` is the text
    of the turn up to the next ``"Human:"`` marker, stripped of surrounding
    whitespace.

    Returns:
        A list of ``{"user_input": ..., "completion": ...}`` dictionaries,
        empty when the transcript contains no ``"AI:"`` marker.
    """
    turns = conversation.split("AI:")
    samples = []
    for position, reply in enumerate(turns[1:], start=1):
        # Re-insert the markers removed by split() to rebuild the context.
        context = "AI:".join(turns[:position])
        answer, _, _ = reply.partition("Human:")
        samples.append(
            {
                "user_input": system_prompt + context,
                "completion": answer.strip(),
            }
        )
    return samples
class ScoreGenerator:
    """Assign scores to a reward dataset by prompting an OpenAI LLM.

    The LLM is wrapped in a LangChain ``LLMChain`` built from the reward
    template; ``distill`` then fills in the missing ``score`` fields of a
    JSON dataset.
    """

    def __init__(
        self,
        llm_model: str,
        llm_temperature: float,
        llm_max_tokens: int,
        reward_template: dict,
    ) -> None:
        """Build the LLM chain used to score the samples.

        Args:
            llm_model: name of the OpenAI model.
            llm_temperature: sampling temperature for the scoring.
            llm_max_tokens: max tokens passed to the LLM; 80% of it is also
                used as the prompt-length budget in ``distill``.
            reward_template: keyword arguments for ``PromptTemplate``.
        """
        # Values coming from a command line may arrive as strings; normalise
        # them so the arithmetic in distill() cannot fail on a str.
        self.llm_max_tokens = int(llm_max_tokens)
        self.llm_temperature = float(llm_temperature)
        self.llm_model = llm_model

        # initialize LLM and LangChain
        openai_llm = OpenAI(
            model_name=llm_model,
            temperature=self.llm_temperature,
            max_tokens=self.llm_max_tokens,
        )

        # Customize your own Reward template by changing the
        # prompt_template
        prompt_template = PromptTemplate(**reward_template)
        print(prompt_template)
        self.llm = LLMChain(llm=openai_llm, prompt=prompt_template)

    def distill(
        self,
        dataset_path: str,
    ) -> None:
        """Score the unscored samples of a dataset and save it back.

        Every entry whose ``score`` is missing or ``None`` is sent to the
        LLM. Entries that are too long for the prompt budget, or whose LLM
        answer cannot be parsed as a float, stay unscored. Unscored entries
        are removed before the file at ``dataset_path`` is overwritten;
        entries with a legitimate score of ``0`` are kept.

        Args:
            dataset_path: path of the JSON dataset, rewritten in place.
        """
        print("Assigning scores to the reward dataset...")

        # load the dataset
        with open(dataset_path, "r") as f:
            train_data = json.load(f)

        # 80% of the max length as safety margin
        max_prompt_len = self.llm_max_tokens * 0.8

        # for each element of the dataset, assign a score.
        for i, data in enumerate(train_data):
            if data.get("score") is not None:
                continue

            user_input = data["user_input"]
            completion = data["completion"]
            print(
                f"#### Data {i}:\n"
                f"#### User_input:\n {user_input}\n"
                f"#### Completion:\n {completion}\n"
            )
            # Rough token estimate: whitespace-separated words / 0.75.
            prompt_text = user_input + completion + self.llm.prompt.template
            prompt_len = int(len(prompt_text.split(" ")) / 0.75)
            if prompt_len > max_prompt_len:
                print(
                    f"The prompt of the data {i} is too long\n"
                    f"tokens: {prompt_len}\n"
                    f"max_tokens: {max_prompt_len}"
                )
                continue

            score = self.llm.run(
                user_input=user_input,
                completion=completion,
            ).strip()
            # TODO: extract from score the float value with a regex
            try:
                score = float(score)
            except ValueError:
                print(
                    f"The score returned by the LLM for the "
                    f"data {i} is not a float:\n{score}"
                )
                continue
            data["score"] = score
            print(f"### Score: {score} \n\n")

        # Remove the data that have no score. Compare against None so that
        # a valid score of 0 is not discarded.
        train_data = [
            data for data in train_data if data.get("score") is not None
        ]

        # save the dataset back
        print("Writing the updated dataset back to disk ... ")
        with open(dataset_path, "w") as f:
            json.dump(train_data, f)

        print("Score Assignment Completed")
parser = argparse.ArgumentParser(
    prog="main.py", description="RLHF Training of ChatBots"
)

parser.add_argument("configfile", help="Path to config.yaml file")
parser.add_argument(
    "-t",
    "--type",
    help=(
        "Specify the training type. "
        "RL: Training of the model using RL. "
        "ACTOR: Training of the actor model. "
        "REWARD: Training of the reward model. "
        "ALL: The whole pipeline with the three training steps"
    ),
    default="ALL",
    choices=["ALL", "RL", "ACTOR", "REWARD"],
)
parser.add_argument(
    "-a", "--actor", help="Specify actor model by name", default=None
)
parser.add_argument(
    "-r", "--reward", help="Specify reward model by name", default=None
)
parser.add_argument(
    "--local_rank",
    help="Local rank parameter for deepspeed",
    default=None,
)

# parse arguments
args = parser.parse_args()

# load config.yaml with all the project information
config = Config(args.configfile)

# overwrite config if specified differently
if args.actor is not None:
    config.actor.model = args.actor
if args.reward is not None:
    config.reward.model = args.reward

# perform the desired training
if args.type == "RL":
    # The actor cannot produce sequences longer than what the reward and
    # critic models accept, so cap it to the shortest of the three.
    max_seq = min(
        config.actor.max_sequence_length,
        config.reward.max_sequence_length,
        config.critic.max_sequence_length,
    )
    config.actor.max_sequence_length = max_seq
    BaseDataset.clean_dataset(config)
    rlhf_trainer = RLTrainer(config)
    rlhf_trainer.train()
elif args.type == "ACTOR":
    BaseDataset.clean_dataset(config.actor)
    actor_trainer = ActorTrainer(config.actor)
    actor_trainer.train()
elif args.type == "REWARD":
    BaseDataset.clean_dataset(config.reward)
    reward_trainer = RewardTrainer(config.reward)
    reward_trainer.train()
elif args.type == "ALL":
    # NOTE(review): unlike the single-step branches, the pipeline neither
    # cleans the datasets nor caps the actor sequence length - confirm
    # whether this is intentional.
    reward_trainer = RewardTrainer(config.reward)
    reward_trainer.train()
    actor_trainer = ActorTrainer(config.actor)
    actor_trainer.train()
    rlhf_trainer = RLTrainer(config)
    rlhf_trainer.train()
# Prompt used to ask an LLM for a 0-5 score of a (user_input, completion)
# pair. Adjacent string literals are concatenated verbatim, so every piece
# must carry its own trailing space or newline.
REWARD_TEMPLATE = dict(
    template=(
        "You have to evaluate the following chat with a score between 0 and "
        "5.\n"
        "You MUST evaluate: text quality, content quality and "
        "coherence.\n"
        "You MUST return only the number that represents your "
        "judgment.\n"
        "The assignement is:\n{user_input}\n"
        "The completion is:\n{completion}\n"
    ),
    input_variables=["user_input", "completion"],
)


# Prompt for the agent playing the assistant; expects the conversation
# history and the latest human message.
AI_CHATBOT_TEMPLATE = dict(
    template=(
        "Assistant is a large language model trained by Meta and Nebuly.ai\n"
        "Assistant is designed to be able to assist with a wide range of "
        "tasks, from answering simple questions to providing in-depth "
        "explanations and discussions on a wide range of topics. As a "
        "language model, Assistant is able to generate human-like text "
        "based on the input it receives, allowing it to engage in "
        "natural-sounding conversations and provide responses that are "
        "coherent and relevant to the topic at hand.\n\n"
        "Assistant is constantly learning and improving, and its capabilities "
        "are constantly evolving. It is able to process and understand large "
        "amounts of text, and can use this knowledge to provide accurate and "
        "informative responses to a wide range of questions. Additionally, "
        "Assistant is able to generate its own text based on the input it "
        "receives, allowing it to engage in discussions and provide "
        "explanations and descriptions on a wide range of topics.\n\n"
        "Overall, Assistant is a powerful tool that can help with a wide "
        "range of tasks and provide valuable insights and information on a "
        "wide range of topics. Whether you need help with a specific "
        "question or just want to have a conversation about a particular "
        "topic, Assistant is here to assist.\n\n{history}\n\n"
        "Human: {human_input}\n"
        "Assistant:"
    ),
    input_variables=["history", "human_input"],
)


# Prompt for the agent playing the human user; expects the conversation
# history and the latest chatbot message.
PERSON_CHATBOT_TEMPLATE = dict(
    template=(
        "You are a human chatting with a chatbot. The chatbot is a large "
        "language model trained by Meta and Nebuly-ai\n"
        "The chatbot is designed to be able to assist you with a wide range "
        "of tasks, from answering simple questions to providing in-depth "
        "explanations and discussions on a wide range of topics. You are a "
        "human and you are testing the chatbot. Ask the chatbot questions and "
        "see how it responds. You can also ask the chatbot to tell you a "
        "story."
        "\n\n{history}\n\n"
        "Chatbot: {chatbot_input}\n"
        "Human:"
    ),
    input_variables=["history", "chatbot_input"],
)
class MyTokenizer:
    """Hugging Face tokenizer exposing the attribute and method names of
    Meta's LLaMA tokenizer (``encode``/``decode``, ``n_words``, ``*_id``).

    Only meant for testing.
    """

    def __init__(self, model_path: Optional[str] = None):
        # without an explicit path the GPT-2 tokenizer is used
        source = "gpt2" if model_path is None else model_path
        self.sp_model = AutoTokenizer.from_pretrained(source)
        hf_tokenizer = self.sp_model
        self.n_words = hf_tokenizer.vocab_size
        self.bos_id = hf_tokenizer.bos_token_id
        self.eos_id = hf_tokenizer.eos_token_id
        # padding reuses the EOS id
        self.pad_id = hf_tokenizer.eos_token_id

    def encode(
        self,
        s: str,
        bos: bool = True,
        eos: bool = True,
        truncation: bool = True,
    ) -> List[int]:
        """Encode ``s`` into token ids, optionally wrapped in BOS/EOS."""
        ids = list(self.sp_model.encode(s, truncation=truncation))
        prefix = [self.bos_id] if bos else []
        suffix = [self.eos_id] if eos else []
        return prefix + ids + suffix

    def decode(self, t: List[int]) -> str:
        """Decode a list of token ids back into text."""
        return self.sp_model.decode(torch.as_tensor(t))
def __call__(self, texts: Union[List[str], str], *args, **kwargs):
    """Tokenize one text or a list of texts.

    Extra positional and keyword arguments are accepted and ignored.

    Args:
        texts: A single string or a list of strings.

    Returns:
        A dict with ``"input_ids"`` and ``"attention_mask"`` tensors.
        For a single string both are 1-D; for a list they have one row
        per text and as many columns as the longest encoding.
    """
    if isinstance(texts, str):
        # single text: no padding needed, every position is attended
        text = self.tokenizer.encode(texts, bos=True, eos=True)
        tokens = torch.tensor(text).long()
        mask = torch.ones_like(tokens)
    else:
        texts = [
            self.tokenizer.encode(text, bos=True, eos=True)
            for text in texts
        ]
        max_len = max(len(text) for text in texts)
        # start from a matrix full of pad ids ...
        tokens = torch.full(
            (len(texts), max_len), self.tokenizer.pad_id
        ).long()
        # ... and write each encoding at the END of its row (left padding)
        for i, text in enumerate(texts):
            tokens[i, -len(text) :] = torch.tensor(  # noqa E203
                text
            ).long()
        # TODO: decide how eos and bos should be handled - i need to mask
        # them? or not?
        mask = self.create_sequence_mask(tokens)
        # re-write the tokens selected by the mask into the slice that
        # ends one position before the last column
        # NOTE(review): presumably keeps the content adjacent to a
        # trailing EOS -- confirm the intent with the author
        for i in range(tokens.shape[0]):
            current_tokens = tokens[i, mask[i] == 1]
            tokens[
                i, -len(current_tokens) - 1 : -1  # noqa E203
            ] = current_tokens
        # the rows were modified in place, so rebuild the mask
        mask = self.create_sequence_mask(tokens)
    # convert `pad_id` from -1 to 0, otherwise embedding will cause out
    # of bounds.
    tokens = torch.where(
        tokens == self.tokenizer.pad_id,
        torch.zeros_like(tokens),
        tokens,
    )
    output = {
        "input_ids": tokens,
        "attention_mask": mask,
    }
    return output
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalisation over the last dimension with a
    learnable per-feature gain.

    Adapted from the RMSNorm class of the LLaMA repository so that it can
    be used for training as well as inference.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # multiply by the reciprocal RMS of the last dimension
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        # normalise in float32, cast back, then apply the gain
        normed = self._norm(x.float())
        return normed.type_as(x) * self.weight


def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Return the ``(end, dim // 2)`` complex rotary factors.

    Entry ``[p, j]`` is the unit complex number whose angle is
    ``p / theta ** (2j / dim)``.
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta**exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    return torch.polar(torch.ones_like(angles), angles)  # complex64


def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """View ``freqs_cis`` so that it broadcasts against ``x``.

    Axis 1 and the last axis of ``x`` keep their size; every other axis
    becomes 1.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    kept_axes = (1, ndim - 1)
    shape = []
    for axis, size in enumerate(x.shape):
        shape.append(size if axis in kept_axes else 1)
    return freqs_cis.view(*shape)


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate queries and keys by the complex factors in ``freqs_cis``.

    Consecutive pairs of the last dimension are read as complex numbers,
    multiplied by ``freqs_cis`` and unpacked again. The results keep the
    dtype of the respective input.
    """

    def _as_complex(t: torch.Tensor) -> torch.Tensor:
        # pair up the last dimension: (..., d) -> (..., d // 2) complex
        return torch.view_as_complex(
            t.float().reshape(*t.shape[:-1], -1, 2)
        )

    q_complex = _as_complex(xq)
    k_complex = _as_complex(xk)
    freqs_cis = reshape_for_broadcast(freqs_cis, q_complex)
    q_rotated = torch.view_as_real(q_complex * freqs_cis).flatten(3)
    k_rotated = torch.view_as_real(k_complex * freqs_cis).flatten(3)
    return q_rotated.type_as(xq), k_rotated.type_as(xk)
def forward(
    self,
    x: torch.Tensor,
    kv_mask: torch.Tensor,
    freqs_cis: torch.Tensor,
    cache_k: Optional[torch.Tensor] = None,
    cache_v: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """Run multi-head self-attention over ``x``.

    Args:
        x: Input whose first two dimensions are (batch, sequence).
        kv_mask: Additive mask summed with the attention scores before
            the softmax; ``None`` disables masking.
        freqs_cis: Rotary factors applied to queries and keys.
        cache_k: Optional external key cache. When given, the new keys
            are written into it and attention reads from it.
        cache_v: Optional external value cache, used like ``cache_k``.

    Returns:
        The attention output followed by the updated key and value
        caches, or ``None`` for both caches when none were passed.

    Raises:
        ValueError: If only one of ``cache_k`` / ``cache_v`` is given.
    """
    start_pos = 0  # Temporary
    bsz, seqlen, _ = x.shape
    xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

    xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
    xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim)
    xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim)

    xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

    # Modified code to allow training, caching is not good for training
    if (cache_k is None) != (cache_v is None):
        raise ValueError(
            "cache_k and cache_v must be either both provided or both None"
        )

    if cache_k is None:
        keys = xk
        values = xv
    else:
        # Tensor.to() is not in-place: keep the returned tensors
        cache_k = cache_k.to(xk.device)
        cache_v = cache_v.to(xv.device)
        cache_k[:bsz, start_pos : start_pos + seqlen] = xk  # noqa E203
        cache_v[:bsz, start_pos : start_pos + seqlen] = xv  # noqa E203
        # read back from the caches that were just written, not from
        # the module-level buffers, which are never updated
        keys = cache_k[:bsz, : start_pos + seqlen]  # noqa E203
        values = cache_v[:bsz, : start_pos + seqlen]  # noqa E203

    # (bsz, seqlen, heads, head_dim) -> (bsz, heads, seqlen, head_dim)
    xq = xq.transpose(1, 2)
    keys = keys.transpose(1, 2)
    values = values.transpose(1, 2)
    scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(
        self.head_dim
    )
    if kv_mask is not None:
        scores = scores + kv_mask
    # softmax in float32, then cast back to the query dtype
    scores = F.softmax(scores.float(), dim=-1).type_as(xq)
    output = torch.matmul(scores, values)
    output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

    # hand the updated caches back so the caller can store them
    return self.wo(output), cache_k, cache_v
""" def __init__( self, dim: int, hidden_dim: int, multiple_of: int, use_fairscale: bool ): super().__init__() hidden_dim = int(2 * hidden_dim / 3) hidden_dim = multiple_of * ( (hidden_dim + multiple_of - 1) // multiple_of ) if use_fairscale: self.w1 = ColumnParallelLinear( dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x, ) self.w2 = RowParallelLinear( hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x, ) self.w3 = ColumnParallelLinear( dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x, ) else: self.w1 = nn.Linear(dim, hidden_dim, bias=False) self.w2 = nn.Linear(hidden_dim, dim, bias=False) self.w3 = nn.Linear(dim, hidden_dim, bias=False) def forward(self, x): return self.w2(F.silu(self.w1(x)) * self.w3(x)) class TransformerBlock(nn.Module): """This class is a modification of the TransformerBlock class implemented in the LLaMA repo. The class has been modified for training, since the original one just supports inference. 
def forward(
    self,
    x: torch.Tensor,
    attention_mask: torch.Tensor,
    freqs_cis: torch.Tensor,
    cache_k: Optional[torch.Tensor] = None,
    cache_v: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """Apply attention and feed-forward, each with a residual connection.

    Args:
        x: Block input.
        attention_mask: 3-D mask; a head axis is inserted at position 1
            and expanded to the number of local heads.
        freqs_cis: Rotary factors forwarded to the attention layer.
        cache_k: Optional external key cache forwarded to attention.
        cache_v: Optional external value cache forwarded to attention.

    Returns:
        The block output and the key/value caches returned by the
        attention layer.
    """
    # changed from the original code so the cache can live outside
    if self.use_fairscale:
        local_heads = (
            self.n_heads // fs_init.get_model_parallel_world_size()
        )
    else:
        local_heads = self.n_heads
    # add the head axis, then broadcast the mask over every local head
    per_head_mask = attention_mask[:, None, :, :].expand(
        -1, local_heads, -1, -1
    )

    normed_input = self.attention_norm(x)
    attn, cache_k, cache_v = self.attention.forward(
        normed_input, per_head_mask, freqs_cis, cache_k, cache_v
    )
    h = x + attn
    ffn_out = self.feed_forward.forward(self.ffn_norm(h))
    return h + ffn_out, cache_k, cache_v
def forward(
    self, tokens: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Return the logits for ``tokens``.

    The attention mask is detached before it is handed to ``_forward``,
    so no gradient flows through it.
    """
    return self._forward(tokens, attention_mask.detach())
@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    max_new_tokens: int,
    temperature: float,
    top_p: float = 1.0,
    no_repeat_ngram_size=None,
):
    """Extend ``input_ids`` by ``max_new_tokens`` tokens.

    Args:
        input_ids: Prompt token ids, one row per sequence.
        attention_mask: Mask with the same shape as ``input_ids``.
        max_new_tokens: Number of tokens to append to every row.
        temperature: Sampling temperature; values ``<= 0`` select the
            most likely token (greedy decoding).
        top_p: Nucleus-sampling threshold, used when ``temperature > 0``.
        no_repeat_ngram_size: Accepted for signature compatibility; it
            is not used.

    Returns:
        The prompt followed by the generated tokens, i.e. a tensor with
        ``input_ids.shape[1] + max_new_tokens`` columns.
    """
    for _ in range(max_new_tokens):
        # only the logits of the last position pick the next token
        logits = self._forward(input_ids, attention_mask)[:, -1, :]
        if temperature > 0:
            probs = torch.softmax(logits / temperature, dim=-1)
            next_token = sample_top_p(probs, top_p)
        else:
            next_token = torch.argmax(logits, dim=-1)
        next_token = next_token.reshape(-1)
        input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
        # the freshly generated position is always attended
        attention_mask = torch.cat(
            [
                attention_mask,
                attention_mask.new_ones((next_token.shape[0], 1)),
            ],
            dim=1,
        )
    # `input_ids` already holds prompt + generated tokens; appending the
    # generated tokens a second time would duplicate them
    return input_ids
def load_checkpoints(
    ckpt_dir: str, local_rank: int, world_size: int
) -> Tuple[dict, dict]:
    """Load the checkpoint shard of this rank and the model parameters.

    Args:
        ckpt_dir: Directory holding one ``*.pth`` file per rank and a
            ``params.json`` file.
        local_rank: Index of the shard to load (shards are sorted by
            path).
        world_size: Expected number of shards.

    Returns:
        The loaded checkpoint and the parsed content of ``params.json``.

    Raises:
        AssertionError: If the number of ``*.pth`` files differs from
            ``world_size``.
    """
    ckpt_root = Path(ckpt_dir)
    checkpoints = sorted(ckpt_root.glob("*.pth"))
    # raised explicitly so the check is not stripped by `python -O`
    if world_size != len(checkpoints):
        raise AssertionError(
            f"Loading a checkpoint for MP={len(checkpoints)} but world "
            f"size is {world_size}"
        )
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    # NOTE: torch.load unpickles the file; only load trusted checkpoints
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(ckpt_root / "params.json", "r") as f:
        params = json.load(f)
    return checkpoint, params
load_model_test( ckpt_dir: str, tokenizer_path: str, local_rank: int, world_size: int, froze_embeddings: bool, use_fairscale: bool, max_batch_size: int = 32, ) -> Tuple[Transformer, HFLikeTokenizer]: # test the model with hf tokenizer model_args = ModelArgs() model_args.froze_embeddings = froze_embeddings model_args.use_fairscale = use_fairscale tokenizer = MyTokenizer(model_path=tokenizer_path) model_args.vocab_size = tokenizer.n_words model = Transformer(model_args).cuda() tokenizer = HFLikeTokenizer(tokenizer) return model, tokenizer ================================================ FILE: optimization/chatllama/chatllama/rlhf/__init__.py ================================================ """RLHF implementation inspired to Lucidrains' implementation.""" ================================================ FILE: optimization/chatllama/chatllama/rlhf/actor.py ================================================ import json import yaml import os import shutil import deepspeed import torch from accelerate import Accelerator from beartype import beartype from beartype.typing import Tuple from einops import rearrange from peft import get_peft_model, LoraConfig, TaskType from torch.utils.data import DataLoader, Dataset from transformers import ( AutoModelForCausalLM, AutoTokenizer, ) from chatllama.rlhf.config import ConfigActor from chatllama.rlhf.model_list import ( hf_models_causal_lm, llama_models, hf_models, ) from chatllama.rlhf.model_loader import ModelLoader from chatllama.rlhf.utils import TrainingStats class ActorModel(torch.nn.Module): """Actor model that generates the augmented prompt from the initial user_input. The aim is to train this model to generate better prompts. Attributes: model: The model from LLaMA to be used tokenizer: The LLaMA tokenizer config (ConfigActor): Configuration for the actor model Methods: load: Load the model from a path save: Save the model to a path forward: Compute the action logits for a given sequence. 
@beartype
def load(self) -> None:
    """Restore the model weights from the model folder, if any exist.

    Nothing happens when ``ModelLoader.check_model_path`` finds no
    saved model.
    """
    path = ModelLoader.check_model_path(
        config=self.config,
        is_checkpoint=False,
        current_epoch=None,
    )
    # nothing saved yet: keep the freshly initialised weights
    if path is None:
        return

    print("Loading ...")
    model_dict = torch.load(path)
    # weights are stored under "state_dict"; "model" is the fallback key
    weights = model_dict.get("state_dict") or model_dict.get("model")
    self.model.load_state_dict(weights)
@staticmethod
def load_tokenizer(config: ConfigActor):
    """Load the tokenizer that matches ``config.model``.

    Args:
        config: Actor configuration; ``model``, ``max_sequence_length``
            and ``tokenizer_path`` are read.

    Returns:
        A Hugging Face tokenizer for models in ``hf_models``, or the
        LLaMA tokenizer for models in ``llama_models``.

    Raises:
        ValueError: If ``config.model`` is in neither model list.
    """
    if config.model in hf_models:
        # load the tokenizer from HF
        tokenizer = AutoTokenizer.from_pretrained(
            config.model,
            padding_side="left",
            padding=True,
            truncation=True,
            model_max_length=config.max_sequence_length,
        )
        # add eos token if not present
        # NOTE(review): the eos token text is an empty string here --
        # confirm this is the intended literal
        if tokenizer.eos_token is None:
            tokenizer.eos_token = ""
            tokenizer.eos_token_id = 2  # OPT eos-token-id
        # add pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        return tokenizer

    if config.model in llama_models:
        # llama module might not be present when HF models are used
        from chatllama.llama_model import (
            load_tokenizer,
        )  # noqa

        return load_tokenizer(config.tokenizer_path)

    # previously this fell through to an unbound `tokenizer` variable
    raise ValueError(f"Model {config.model} not supported")
@beartype @torch.no_grad() def generate( self, states: torch.Tensor, state_mask: torch.Tensor ) -> Tuple: """Generate actions and sequences=[states, actions] from state (i.e. input of the prompt generator model) Args: state (torch.Tensor): the input of the user state_mask (torch.Tensor): Mask for the state input (for padding) Returns: actions (torch.Tensor): Actions generated from the state sequences (torch.Tensor): Sequences generated from the state as [states, actions] """ # temperature for the actor temperature = self.config.temperature # max sequence length for the actor (i.e. prompt + completion) max_sequence_length = self.config.max_sequence_length # max and min number of tokens to generate max_tokens = self.config.max_tokens min_tokens = self.config.min_tokens # max generation possible given the state and the max sequence length max_generation_possible = max_sequence_length - states.shape[1] if max_generation_possible < min_tokens: raise ValueError( f"The prompt is too long w.r.t the " f"model sequence length \n" f"max_sequence_length={max_sequence_length}\n" f"state_length={states.shape[1]}\n" f"min_tokens={min_tokens}\n" f"max_tokens={max_tokens}\n" f"max_generation_possible={max_generation_possible}\n" ) # take the minimum the max_tokens and the max_generation_possible max_completion = min(max_tokens, max_generation_possible) sequences = self.model.generate( input_ids=states, attention_mask=state_mask, temperature=temperature, max_new_tokens=max_completion, no_repeat_ngram_size=3, ) actions = sequences[:, states.shape[1] :] # noqa E203 if self.config.debug: print( f"input length {states.shape[1]} \n" f"max sequence length {max_sequence_length} \n" f"max completion {max_completion} \n" f"generated sequence {sequences.shape[1]} \n" ) print("ActorModel.generate") print("state", states) print("state shape", states.shape) print("sequence shape", sequences.shape) print("sequence", sequences) print("actions shape", actions.shape) print("actions", actions) return 
class ActorDataset(Dataset):
    """Text dataset used to pre-train the actor model.

    The JSON file at ``path`` must hold a list of objects shaped like::

        [
            {
                "user_input": "...",
                "completion": "..."
            },
            ...
        ]

    Each sample is the ``user_input`` string immediately followed by the
    ``completion`` string.
    """

    def __init__(
        self,
        path: str,
    ) -> None:
        self.path = path
        with open(path, "r") as f:
            records = json.load(f)
        samples = []
        for record in records:
            # prompt and completion are joined without a separator
            samples.append(record["user_input"] + record["completion"])
        self.data = samples

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(
        self,
    ):
        return len(self.data)
dataset and dataloaders self.train_dataset = ActorDataset(config.train_dataset_path) self.train_dataloader = DataLoader( self.train_dataset, batch_size=config.batch_size ) if self.validation_flag: self.eval_dataset = ActorDataset(config.validation_dataset_path) self.validation_dataloader = DataLoader( self.eval_dataset, batch_size=config.batch_size ) # define scheduler for the learning rate # learning rate is decreased until 10% of the initial value self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts( self.optimizer, T_0=len(self.train_dataset) // config.batch_size, T_mult=1, eta_min=config.lr * 0.1, ) # define training statistics stat_path = ModelLoader.get_training_stats_path(config) self.training_stats = TrainingStats(stat_path) # consistency check between accelerate and deepspeed if config.accelerate_enable and config.deepspeed_enable: raise ValueError( "Both DeepSpeed and Accelerate are enabled for the Actor." "Please choose one of them." ) # initialize deepspeed self.model_engine = None if config.deepspeed_enable is True: if config.deepspeed_config_path is None: raise ValueError( "DeepSpeed config path is None, but deepspeed is enabled" ) if os.path.exists(config.deepspeed_config_path) is False: raise ValueError( f"DeepSpeed config path {config.deepspeed_config_path}" f"does not exist" ) ( self.model_engine, self.optimizer, self.train_dataloader, _, ) = deepspeed.initialize( args=None, model=self.actor, model_parameters=self.actor.parameters(), training_data=self.train_dataset, config=self.config.deepspeed_config_path, ) print("Training with DeepSpeed") # initialize accelerate self.accelerator = None if config.accelerate_enable is True: self.accelerator = Accelerator() ( self.actor, self.optimizer, self.train_dataloader, self.scheduler, ) = self.accelerator.prepare( self.actor, self.optimizer, self.train_dataloader, self.scheduler, ) print("Training with Accelerate") @beartype def save_checkpoint( self, current_epoch: int, current_step: int, 
@beartype
def load_checkpoint(
    self,
) -> Tuple[int, int]:
    """Load the latest checkpoint from the model folder, if any.

    Without DeepSpeed, the model weights, the optimizer state and the
    training statistics are restored from the checkpoint file. With
    DeepSpeed, loading is delegated to the model engine.

    Returns:
        Tuple[int, int]: Epoch and step at which training resumes:
        ``(epoch, step + 1)`` from the checkpoint, or ``(0, 0)`` when no
        checkpoint exists or it cannot be loaded.
    """
    print("Looking for checkpoints...")
    # look for a checkpoint
    path = ModelLoader.check_model_path(
        config=self.config,
        is_checkpoint=True,
        current_epoch=None,
    )

    # no checkpoint: start from scratch
    if path is None:
        return 0, 0

    print("Loading ...")
    if self.config.deepspeed_enable:
        # best effort: an unreadable checkpoint restarts the training
        try:
            _, client_state = self.model_engine.load_checkpoint(path)
        except Exception:
            print(
                "Checkpoint corrupted!"
                "Try to remove the last checkpoint."
                "Now Starting from epoch 0, step 0"
            )
            return 0, 0
        # load epoch and step to resume loops
        epoch = client_state["epoch"]
        step = client_state["step"]
    else:
        # best effort: an unreadable checkpoint restarts the training
        try:
            checkpoint = torch.load(path)
        except Exception:
            print(
                "Checkpoint corrupted!"
                "Try to remove the last checkpoint."
                "Now Starting from epoch 0, step 0"
            )
            return 0, 0
        # assign the checkpoint content to the trainer
        epoch = checkpoint["epoch"]
        self.actor.model.load_state_dict(checkpoint["state_dict"])
        self.optimizer.load_state_dict(checkpoint["optim_state_dict"])
        # was `self.trainign_stats`, which left the restored statistics
        # on an attribute nothing else reads
        self.training_stats = checkpoint["training_stats"]
        step = checkpoint["step"]

    # resume from the step after the one that was saved
    return epoch, step + 1
counter for the checkpoint cnt_checkpoint = 1 # traing loop for epoch in range(start_epoch, epochs): self.actor.train() for i, input_text in enumerate(self.train_dataloader): # skip the first steps if we are resuming training if i < start_step: continue # tokenize input with torch.no_grad(): input_tokenized = self.actor.tokenizer( input_text, return_tensors="pt", truncation=True, padding=True, ) # split tokens and mask input_tokenized_id = input_tokenized["input_ids"] input_tokenized_mask = input_tokenized["attention_mask"] # add eos token ( input_tokenized_id, input_tokenized_mask, ) = self.add_eos_token( input_tokenized_id, input_tokenized_mask, ) # split into input and output training_output = input_tokenized_id[:, 1:] training_input = input_tokenized_id[:, :-1] attention_mask = input_tokenized_mask[:, :-1] # move to device training_output = training_output.to(device) training_input = training_input.to(device) attention_mask = attention_mask.to(device) # forward pass if self.config.deepspeed_enable: est_output = self.model_engine( training_input, attention_mask ) else: est_output = self.actor(training_input, attention_mask) # compute loss est_output = rearrange(est_output, "b s v -> (b s) v") training_output = rearrange(training_output, "b s -> (b s)") loss = self.loss_function(est_output, training_output) self.training_stats.training_loss.append(loss.item()) # backward pass if self.config.deepspeed_enable: self.model_engine.backward(loss) self.model_engine.step() elif self.config.accelerate_enable: self.optimizer.zero_grad() self.accelerator.backward(loss) self.optimizer.step() self.scheduler.step() else: self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.scheduler.step() # print progress if i % self.config.iteration_per_print == 0: print( f"Epoch: {epoch+1}/{epochs}, " f"Iteration: {i+1}/{n_iter}, " f"Training Loss: {loss}" ) # save checkpoint periodically if cnt_checkpoint % checkpoint_steps == 0: self.save_checkpoint(epoch, i, epochs, 
@dataclass
class ConfigReward:
    """Config parameters for the reward model

    Attributes:
        device (torch.device): Device to be used for the reward model
        model (str): Model to be used for the reward model
        model_folder (str): Path to the folder where model are stored (used
            to load / store finetuned model or checkpoints)
        model_head_hidden_size (int): Hidden size of the reward model head
        max_sequence_length (int): Max sequence length of the reward model
        train_dataset_path (Optional[str]): Path to the training dataset.
            Default to None. To be specified only for the reward model
            training.
        validation_dataset_path (Optional[str]): Path to the validation
            dataset. Default to None. To be specified only for the reward
            model training.
        batch_size (Optional[int]): Batch size to train the reward model.
            Default to None. To be specified only for the reward model
            training.
        epochs (Optional[int]): Number of epochs to train the reward model.
            Default to None. To be specified only for the reward model
            training.
        iteration_per_print (Optional[int]): Number of iterations to print
            the training loss. Default to None. To be specified only for
            the reward model training.
        checkpoint_steps (Optional[int]): Number of steps (backProp) to
            interleave checkpoints. Default to None. To be specified only
            for the reward model training.
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        lr (Optional[float]): Learning rate for the reward model. Default
            to None. To be specified only for the reward model training.
        llm_enable (bool): Enable reward model distillation. Default to
            False. Enable it only if you have an API key.
        llm_model (Optional[str]): Model to be used for the reward model
            distillation. Default to "text-davinci-003".
        llm_temperature (Optional[float]): Temperature for the reward model
            distillation. Default to 0.9.
        llm_max_tokens (Optional[int]): Max tokens for the reward model
            distillation. Default to 64.
        deepspeed_enable (bool): Enable deepspeed for the reward model
            training. Default to False.
        deepspeed_config_path (Optional[str]): Path to the deepspeed config
            file. Default to None.
        is_reward (bool): True if the model is a reward model, False when
            the same config class is used for the critic. Default to True.
        accelerate_enable (bool): Enable accelerate for the reward model.
            Default to False.
        debug (bool): Enable prints for debugging. Default to False.
    """

    device: torch.device
    model: str
    model_folder: str
    model_head_hidden_size: int
    max_sequence_length: int
    train_dataset_path: Optional[str] = None
    validation_dataset_path: Optional[str] = None
    batch_size: Optional[int] = None
    epochs: Optional[int] = None
    iteration_per_print: Optional[int] = None
    checkpoint_steps: Optional[int] = None
    checkpoint_name: Optional[str] = None
    lr: Optional[float] = None
    llm_enable: Optional[bool] = False
    llm_model: Optional[str] = "text-davinci-003"
    llm_temperature: Optional[float] = 0.9
    llm_max_tokens: Optional[int] = 64
    deepspeed_enable: bool = False
    deepspeed_config_path: Optional[str] = None

    # critic specific parameters
    is_reward: bool = True

    accelerate_enable: bool = False
    debug: bool = False
max_sequence_length (int): Max sequence length for the actor max_tokens (int): Max tokens for actor generation min_tokens (int): Min tokens for actor generation additonal_prompt_tokens (int): Number of tokens to be used as safety to avoid too large sequences and to add a template to the dataset temperature (float): Temperature for the actor batch_size (int): Batch size to train the actor iteration_per_print (int): Number of iterations to print the training loss lr (float): Learning rate for the actor epochs (int): Number of epochs to train the actor checkpoint_steps (int): Number of steps (backProp) to interleave checkpoints. n_checkpoints_to_keep (int): Number of checkpoints to keep for the actor. deepspeed_enable (bool): Enable deepspeed for the actor. Default to False. deepspeed_config_path (str): Path to the deepspeed config file. Default to None. accelerate_enable (bool): Enable accelerate for the actor device (torch.device): Device to be used for the actor checkpoint_name (Optional[str]): Name of the checkpoint. Default to None. peft_enable (bool): Enable peft for the actor peft_config_path (str): Path to the peft config file. 
@dataclass
class ConfigTrainer:
    """Config parameters for the trainer, used to configure the reinforcement
    learning training loop

    Attributes:
        actor_lr (float): Learning rate for the actor when training with
            reinforcement learning
        critic_lr (float): Learning rate for the critic when training with
            reinforcement learning
        actor_eps_clip (float): Epsilon clip for the actor
        critic_eps_clip (float): Epsilon clip for the critic
        beta_s (float): Beta for the actor and critic
        gamma_discounted (float): Coefficient for the discounted rewards.
        examples_path (str): Path to the examples dataset
        num_episodes (int): Number of episodes, each episodes consist of
            a number of timesteps that are used to generate examples
            stored in the memory buffer.
        max_timesteps (int): Max timesteps for the actor and critic.
            For each timestep a set of examples are sampled and used to
            generate a completion and a reward.
        update_timesteps (int): Number of timesteps to update the actor
            and critic
        num_examples (int): Number of examples to generate for the actor
            and critic. For each iteration of timestep, num_examples are
            sampled from the prompt dataset, processed and stored in the
            memory buffer.
        batch_size (int): Batch size to train the actor and critic.
            This batch is used to aggregate the memory from the memory
            buffer for the actual training of the actor and critic models.
        epochs (int): Number of epochs to train the actor and critic.
        checkpoint_steps (int): Number of episodes to interleave checkpoints.
        device (torch.device): Device to be used for the actor and critic
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        debug (bool): Enable prints for debugging. Default to False.
    """

    # learning rates are fractional values: they were annotated as `int`,
    # contradicting both the documentation above and their actual use
    actor_lr: float
    critic_lr: float
    actor_eps_clip: float
    critic_eps_clip: float
    beta_s: float
    gamma_discounted: float
    examples_path: str
    num_episodes: int
    max_timesteps: int
    update_timesteps: int
    num_examples: int
    batch_size: int
    epochs: int
    checkpoint_steps: int
    device: torch.device
    checkpoint_name: Optional[str] = None
    debug: bool = False
Attributes: trainer (ConfigTrainer): Config parameters for the trainer actor (ConfigActor): Config parameters for the actor critic (ConfigCritic): Config parameters for the critic reward (ConfigReward): Config parameters for the reward """ @beartype def __init__( self, path: str, device: Optional[torch.device] = None, debug: Optional[bool] = False, ) -> None: # if not specified use the device available if device is None: if torch.cuda.is_available(): device = torch.device("cuda") else: raise ValueError("No GPU available") print(f"Current device used :{str(device)}") if path is None or os.path.exists(path) is False: raise ValueError("Path to the config.yaml is not valid") # Read the config from yaml with open(path, "r") as c: config = yaml.safe_load(c) trainer_dict = config["trainer_config"] actor_dict = config["actor_config"] critic_dict = config["critic_config"] reward_dict = config["reward_config"] # Trainer Config trainer_dict["device"] = device trainer_dict["debug"] = debug self.trainer = ConfigTrainer(**trainer_dict) # Actor Config actor_dict["device"] = device actor_dict["debug"] = debug self.actor = ConfigActor(**actor_dict) # Critic Config critic_dict["device"] = device critic_dict["debug"] = debug self.critic = ConfigCritic(**critic_dict) self.critic.is_reward = False # Reward Config reward_dict["device"] = device reward_dict["debug"] = debug self.reward = ConfigReward(**reward_dict) ================================================ FILE: optimization/chatllama/chatllama/rlhf/dataset.py ================================================ import json import os import numpy as np from beartype.typing import Dict, List, Union from datasets import load_dataset from chatllama.rlhf.config import Config, ConfigActor, ConfigReward from chatllama.rlhf.reward import RewardModel, CriticModel from chatllama.rlhf.actor import ActorModel ConfigType = Union[Config, ConfigActor, ConfigReward] class BaseDataset: def __init__( self, ) -> None: pass @staticmethod def 
sort_conversation( conversations: List[Dict], only_input: bool = False, reverse: bool = True, shuffle: bool = True, ) -> List[Dict]: """Sort the conversations by length of user_input + completion or by length of user_input only Args: conversations (List[Dict]): list of conversations only_input (bool, optional): sort by length of user_input only. Defaults to False. reverse (bool, optional): sort in descending order. Defaults to True. shuffle (bool, optional): shuffle the dataset leaving only the first 100 samples sorted. Defaults to True. Returns: List[Dict]: sorted list of conversations """ # define the sorting function if only_input is True: def sort_fun(x): return len(x["user_input"]) else: def sort_fun(x): return len(x["user_input"]) + len(x["completion"]) # sort conversations = sorted( conversations, key=sort_fun, reverse=reverse, ) # shuffle if shuffle is True: conversations = ( conversations[:10] + np.random.choice( conversations[10:], size=len(conversations[10:]), replace=False, ).tolist() ) return conversations @staticmethod def take_n_samples( conversations: List[Dict], n: int, ) -> List[Dict]: """Take N samples from the dataset Args: conversations (List[Dict]): list of conversations n (int): number of samples to take randomly Returns: List[Dict]: list of N samples """ # sample N number of index from 0 to len(conversations) indexes = np.random.choice(len(conversations), size=n, replace=False) # take the samples conversations = [conversations[i] for i in indexes] return conversations @staticmethod def clean_dataset(config: ConfigType): """Clean the datasets by removing too long examples The Reward Dataset constraints are: - user_input + completion < Reward model max sequence length The Actor Dataset constraints are: - user_input + completion < Actor model max sequence length The RLHF Training Dataset constraints are: - user_input + min_completion < Actor model max sequence length - user_input + min_completion < Critic model max sequence length - user_input 
+ min_completion < Reward model max sequence length Args: config (Config): config object """ if isinstance(config, Config): print("Start cleaning the dataset for RLHF") # constraints r_model_max_seq_len = config.reward.max_sequence_length a_model_max_seq_len = config.actor.max_sequence_length c_model_max_seq_len = config.critic.max_sequence_length min_completion = config.actor.min_tokens # dataset dataset_path = config.trainer.examples_path # tokenizers r_tokenizer = RewardModel.load_tokenizer(config.reward) a_tokenizer = ActorModel.load_tokenizer(config.actor) c_tokenizer = CriticModel.load_tokenizer(config.critic) # safety tokens safety_tokens = config.actor.additonal_prompt_tokens elif isinstance(config, ConfigActor): print("Start cleaning the dataset for Actor") # constraint a_model_max_seq_len = config.max_sequence_length # dataset dataset_path = config.train_dataset_path # tokenizer a_tokenizer = ActorModel.load_tokenizer(config) # safety tokens safety_tokens = config.additonal_prompt_tokens elif isinstance(config, ConfigReward): print("Start cleaning the dataset for Reward") # constraint r_model_max_seq_len = config.max_sequence_length # dataset dataset_path = config.train_dataset_path # tokenizer r_tokenizer = RewardModel.load_tokenizer(config) # if there is the datasets if os.path.exists(dataset_path): # load the dataset with open(dataset_path, "r") as f: conversations = json.load(f) # sort in desceding order - longest first if isinstance(config, Config): conversations = BaseDataset.sort_conversation( conversations, only_input=True, reverse=True, ) else: conversations = BaseDataset.sort_conversation( conversations, only_input=False, reverse=True, ) old_len = len(conversations) # remove too long examples # since datasets are ordered by the length # we can remove the first elements until we find # an example that is not too long while len(conversations) > 0: # get the text to be tokenized if isinstance(config, Config): text = conversations[0]["user_input"] 
def reformat_dataset(self, data: List) -> List[Dict]:
    """Reformat the dataset to the format required by RLHF

    For each example the higher scored human reference is kept as the
    completion (reference B wins ties) and the post history becomes the
    user input.

    Args:
        data (List): dataset from HuggingFace

    Returns:
        List[Dict]: reformatted dataset, one dict per example with the
            keys "user_input", "completion" and "score" (always None)
    """
    # initialize conversations
    conversations = []

    # loop over the dataset
    for d in data:
        if d["score_A"] > d["score_B"]:
            response = d["human_ref_A"]
        else:
            response = d["human_ref_B"]

        # compose user_input template; the stripped history was previously
        # computed and then discarded, so trailing newlines leaked in
        # front of the "##" separator
        history = d["history"].rstrip("\n")
        user_input = "Human: " + history + "\n\n##\n\n"

        # compose completion template
        completion = "Assistant: " + response

        conversations.append(
            {
                "user_input": user_input,
                "completion": completion,
                "score": None,
            }
        )
    return conversations
def reformat_dataset(self, data: List) -> List[Dict]:
    """Reformat the dataset to the format required by RLHF

    Only the "chosen" dialogue of each example is used. Its text is split
    on the "Assistant:" marker: the last piece becomes the completion and
    everything before it becomes the user input.

    Args:
        data (List): dataset from HuggingFace

    Returns:
        List[Dict]: reformatted dataset
    """
    marker = "Assistant:"
    reformatted = []
    for example in data:
        *earlier_turns, last_answer = example["chosen"].split(marker)

        # rebuild the dialogue that precedes the last assistant answer;
        # without any marker the whole text is used as context
        if earlier_turns:
            context = marker.join(earlier_turns)
        else:
            context = last_answer

        # drop the trailing newlines before appending the separator
        context = context.rstrip("\n")

        reformatted.append(
            {
                "user_input": context + "\n\n##\n\n",
                "completion": "Assistant: " + last_answer,
                "score": None,
            }
        )
    return reformatted
""" print("Generate datasets for RLHF") # generate actor and reward dataset conversations = self.reformat_dataset(self.dataset["train"]) conversations = self.sort_conversation(conversations, reverse=reverse) # save actor training data with open(f"{dataset_folder}/actor_training_data.json", "w") as f: json.dump(conversations, f, indent=4) # sample N number of index from 0 to len(conversations) conversations = self.take_n_samples(conversations, number_of_samples) conversations = self.sort_conversation(conversations, reverse=reverse) # save reward training data with open(f"{dataset_folder}/reward_training_data.json", "w") as f: json.dump(conversations, f, indent=4) # rlhf dataset conversations = self.reformat_dataset(self.dataset["test"]) # sort conversations by length of user_input conversations = self.sort_conversation( conversations, only_input=True, reverse=reverse ) # save rlhf training data with open(f"{dataset_folder}/rlhf_training_data.json", "w") as f: json.dump(conversations, f, indent=4) print("Generation Completed") ================================================ FILE: optimization/chatllama/chatllama/rlhf/model_list.py ================================================ # llama models llama_models = ["llama-7B", "llama-13B", "llama-33B", "llama-65B"] # HF Models # encoder-decoder models TODO: still not supported hf_models_seq_2_seq = [ "google/flan-t5-xxl", "google/flan-t5-xl", "google/flan-t5-large", "google/flan-t5-base", "google/flan-t5-small", ] # decoder only TODO: codegen is still broken hf_models_causal_lm = [ "facebook/opt-125m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-11b", "facebook/galactica-125m", "facebook/galactica-1.3b", "facebook/galactica-6.7b", "bigscience/bloom-560m", "bigscience/bloomz-560m", "bigscience/bloom-1b1", "bigscience/bloomz-1b1", "bigscience/bloom-1b7", "bigscience/bloomz-1b7", "bigscience/bloom-3b", "bigscience/bloomz-3b", "bigscience/bloom-7b1", "bigscience/bloomz-7b1", 
"EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neox-20b", "EleutherAI/gpt-j-6B", "gpt2", "gpt2-large", "gpt2-xl", "benjamin/gerpt2", "benjamin/gerpt2-large", "Salesforce/codegen-350M-mono", "Salesforce/codegen-2B-mono", "Salesforce/codegen-6B-mono", "Salesforce/codegen-16B-mono", ] # create a list of all the models from hf hf_models = hf_models_seq_2_seq + hf_models_causal_lm ================================================ FILE: optimization/chatllama/chatllama/rlhf/model_loader.py ================================================ import os import shutil from beartype.typing import Union, Optional, Tuple from chatllama.rlhf.config import ( Config, ConfigActor, ConfigCritic, ConfigReward, ) from chatllama.rlhf.model_list import hf_models ConfigType = Union[Config, ConfigActor, ConfigCritic, ConfigReward] class ModelLoader: """Class to load and save models and their checkpoints during training.""" def __init__( self, ) -> None: pass @staticmethod def get_training_stats_path(config: ConfigType) -> str: """Method to get the path to the training stats file. 
Used when saving Args: config (ConfigType): the config object """ model_folder, model_name, path = ModelLoader.get_model_path( config, is_checkpoint=True ) stat_path = os.path.join(model_folder, "training_stats.json") return stat_path @staticmethod def look_for_last_checkpoint( model_folder: str, model_name: str, ) -> Optional[str]: """Method to look for the last checkpoint in the model folder checkpoint are saved as {model_name}_epoch_{current_epoch}.pt Args: model_folder (str): the folder where the checkpoints are saved model_name (str): the name of the model """ # remove .pt to model name model_name = model_name.split(".")[0] checkpoints = [ f for f in os.listdir(model_folder) if f.startswith(model_name) ] if len(checkpoints) == 0: return None else: checkpoints = sorted(checkpoints) # get last checkpoint last_checkpoint = checkpoints[-1] return last_checkpoint @staticmethod def look_for_checkpoint_by_name( model_folder: str, checkpoint_name: str, ) -> Optional[str]: """Method to look for a particular checkpoint in the model folder checkpoint are saved as {model_name}_epoch_{current_epoch}_steps_{current_steps}.pt Args: model_folder (str): the folder where the checkpoints are saved checkpoint_name (str): the name of the checkpoint """ # look for a file named checkpoint_name in the model folder path = os.path.join(model_folder, checkpoint_name) if os.path.exists(path): return checkpoint_name else: return None @staticmethod def get_checkpoint_name(config: ConfigType) -> str: if isinstance(config, Config): return config.trainer.checkpoint_name else: return config.checkpoint_name @staticmethod def get_base_model_folder_from_config(config: ConfigType) -> str: if isinstance(config, ConfigActor) or isinstance(config, ConfigReward): return config.model_folder elif isinstance(config, Config): return config.actor.model_folder else: raise ValueError( "Config type not recognized during saving or loading" ) @staticmethod def get_model_type_from_config(config: ConfigType) -> 
str: if isinstance(config, ConfigReward): # here use ad-hoc flag from config to distinguish between # reward and critic if config.is_reward: return "reward" else: return "critic" elif isinstance(config, ConfigActor): return "actor" elif isinstance(config, Config): return "actor_rl" @staticmethod def get_model_name_from_config(config: ConfigType) -> str: model_name = None if isinstance(config, Config): model_name = config.actor.model elif isinstance(config, ConfigReward) or isinstance( config, ConfigActor ): model_name = config.model if model_name in hf_models: return os.path.split(model_name)[-1] if model_name is None: raise ValueError("Model name not found") return model_name @staticmethod def delete_old_checkpoints( model_folder: str, model_name: str, n_ckp_to_keep: int = 5 ): """Method to discard old checkpoints, keeping only the last n_ckp_to_keep Args: model_folder (str): the folder where the checkpoints are saved model_name (str): the name of the model n_ckp_to_keep (int): the number of checkpoints to keep """ # remove .pt to model name model_name = model_name.split(".")[0] checkpoints = [ f for f in os.listdir(model_folder) if f.startswith(model_name) ] if len(checkpoints) == 0: return else: checkpoints = sorted(checkpoints) # check if the number of checkpoint is greater than 5 if len(checkpoints) > n_ckp_to_keep: for c in checkpoints[:-n_ckp_to_keep]: checkpoint_path = os.path.join(model_folder, c) os.remove(checkpoint_path) @staticmethod def get_model_path( config: ConfigType, is_checkpoint: bool = False, current_epoch: Optional[int] = None, current_step: Optional[int] = None, max_epochs: int = 1_000_000_000, max_steps: int = 1_000_000_000, ) -> Tuple[str, str, Optional[str]]: """Method to get the path to the right model file. Used when saving the model. 
The hierarchy of the model folder is: -- model_folder: here store the models trained, for each type of model there is a dedicated folder -- actor -- critic -- reward -- actor_rl -- checkpoints: here store the checkpoints during training, for each type of model there is a dedicated folder -- actor -- critic -- reward -- actor_rl Args: config (ConfigType): the config object, contains info of the model is_checkpoint (bool): if True, the path is for a checkpoint current_epoch (Optional[int]): the current epoch, used to create the checkpoint name. If is_checkpoint is True, and current_epoch is None, return just the folder and the simple model name for the possible checkpoint. current_step (Optional[int]): the current step, used to create the checkpoint name. max_epochs (Optional[int]): the maximum number of epochs, used to create the checkpoint name. max_steps (Optional[int]): the maximum number of steps, used to create the checkpoint name. Returns: model_folder (str): the folder where the model is saved model_name (str): the name of the model path (Optional[str]): the path to the model. If is_checkpoint is True, and current_epoch is None, return None """ model_folder = ModelLoader.get_base_model_folder_from_config(config) # Add the checkpoint path if necessary if is_checkpoint: model_folder = os.path.join(model_folder, "checkpoints") # Create the folder for the model type # (Actor, Critic, Reward, Actor_RL) model_type = ModelLoader.get_model_type_from_config(config) model_folder = os.path.join(model_folder, model_type) # Make the path if not exists if os.path.exists(model_folder) is False: os.makedirs(model_folder, exist_ok=True) print(f"Model folder does not exist. 
@staticmethod
def check_model_path(
    config: ConfigType,
    is_checkpoint: bool = False,
    current_epoch: Optional[int] = None,
    current_step: Optional[int] = None,
) -> Optional[str]:
    """Method to check if the model path exists to load models
    or checkpoints.

    Args:
        config (ConfigType): the config object, contains info of the model
        is_checkpoint (bool): if True, the path is for a checkpoint
        current_epoch (Optional[int]): the current epoch.
            If is_checkpoint is True and current_epoch is None, the
            checkpoint named in the config is looked up, or the last
            checkpoint available when no name is configured.
        current_step (Optional[int]): accepted for interface
            compatibility; it is not used by the lookup.

    Returns:
        path (Optional[str]): the path to the model or checkpoint if it
            exists on disk, None otherwise.
    """
    model_folder, model_name, path = ModelLoader.get_model_path(
        config,
        is_checkpoint,
        current_epoch,
    )

    # name of the checkpoint requested in the config, if any
    checkpoint_name = None
    if is_checkpoint:
        checkpoint_name = ModelLoader.get_checkpoint_name(config)

    # If i am looking for a checkpoint without knowing its epoch
    if is_checkpoint and current_epoch is None:
        if checkpoint_name is not None:
            # If the checkpoint is specified by name use it
            checkpoint = ModelLoader.look_for_checkpoint_by_name(
                model_folder, checkpoint_name
            )
        else:
            checkpoint = ModelLoader.look_for_last_checkpoint(
                model_folder, model_name
            )
        if checkpoint is not None:
            path = os.path.join(model_folder, checkpoint)

    # a path that does not exist on disk is reported as not found
    if path is not None and not os.path.exists(path):
        path = None

    if path is None:
        if not is_checkpoint:
            print(
                f"No previous model found at "
                f"{model_folder} for model {model_name}"
            )
        elif checkpoint_name is not None:
            # use the resolved name: a Config object has no
            # `checkpoint_name` attribute (it lives in config.trainer)
            print(
                f"No checkpoint found at {model_folder} "
                f"with name {checkpoint_name}"
            )
        else:
            print(
                f"No previous checkpoint found at "
                f"{model_folder} for {model_name}"
            )
        return None

    if not is_checkpoint:
        print(f"Found model at {path}")
        return path

    # the name is modelname_epoch_00000001_step_00000001.pt
    # or modelname_epoch_00000001.pt; parse the file name only, so that
    # the folder names cannot interfere
    file_name = os.path.basename(path)
    try:
        if "_step_" in file_name:
            epoch = int(file_name.split("_epoch_")[-1].split("_")[0])
            step = int(file_name.split("_step_")[-1].split(".")[0])
            print(
                f"Found checkpoint for epoch {epoch + 1},"
                f" step {step + 1}..."
            )
        else:
            epoch = int(file_name.split("_epoch_")[-1].split(".")[0])
            print(f"Found checkpoint for epoch {epoch + 1} ...")
    except ValueError:
        # checkpoints selected by a custom name may not follow the
        # naming scheme: the log line must not prevent loading them
        print(f"Found checkpoint at {path}")
    return path
If the critic folder is empty """ if config.is_reward is True: raise ValueError( "The config should work for the Critic model," "but the config seems to be for the Reward model" ) # check that the critic folder is empty path = ModelLoader.check_model_path(config) _, _, critic_path = ModelLoader.get_model_path(config) if path is None: print("Initializing Critic from Reward model...") config.is_reward = True path = ModelLoader.check_model_path(config) if path is not None: _, _, reward_path = ModelLoader.get_model_path(config) # copy the file in reward_path to critic_path shutil.copy(reward_path, critic_path) else: print("Critic Model remains uninitialized") config.is_reward = False ================================================ FILE: optimization/chatllama/chatllama/rlhf/reward.py ================================================ import json import shutil import os import deepspeed import torch from accelerate import Accelerator from beartype import beartype from beartype.typing import Iterable, Tuple from einops.layers.torch import Rearrange from torch.utils.data import Dataset, DataLoader from transformers import ( AutoModel, AutoTokenizer, ) from chatllama.rlhf.config import ConfigReward from chatllama.rlhf.model_list import hf_models from chatllama.rlhf.model_loader import ModelLoader from chatllama.rlhf.utils import TrainingStats class RewardModel(torch.nn.Module): """Model to be trained to predict the reward for RL. or to be used as Critic in RL. It is a Language Model with a head that predicts the reward (a scalar) for a given sequence of tokens. 
Attributes: model (torch.nn.Module): Model to be used for the reward model tokenizer (torch.nn.Module): Tokenizer to be used for the reward model head (torch.nn.Module): Head to be used for the reward model config (ConfigReward): Config parameters for the reward model Methods: load_tokenizer: Load the tokenizer for the reward model forward: Forward pass of the model (used by the critic) save: Save the model load: Load the model get_reward: Get the reward for a given input (used by the reward model) parameters: Return the parameters of the reward model """ def __init__(self, config: ConfigReward) -> None: super().__init__() # store config self.config = config # initialize the self.model head_hidden_size = config.model_head_hidden_size if config.model in hf_models: self.tokenizer = self.load_tokenizer(config) self.model = AutoModel.from_pretrained(config.model) head_dim = self.model.config.hidden_size if config.model.startswith("gpt2"): head_dim = self.model.config.n_embd self.head = torch.nn.Sequential( torch.nn.Linear(head_dim, head_hidden_size), torch.nn.ReLU(), torch.nn.Linear(head_hidden_size, 1), Rearrange("... 
@beartype
def load(self) -> None:
    """Load the backbone and head weights of a previously saved model.

    Looks for a saved (non-checkpoint) model through
    ModelLoader.check_model_path; when none is found the freshly
    initialized weights are left untouched.

    Raises:
        KeyError: if the file holds neither a "state_dict" nor a "model"
            entry for the backbone, or no "head" entry.
    """
    # look for a pretrained model
    path = ModelLoader.check_model_path(
        config=self.config,
        is_checkpoint=False,
        current_epoch=None,
    )
    if path is None:
        return

    print("Loading ...")
    # Deserialize on CPU so a file written on a GPU host can be read on
    # any machine; load_state_dict copies the values into the existing
    # parameters, which __init__ moves to config.device afterwards.
    model_dict = torch.load(path, map_location="cpu")

    # the backbone weights are stored under "state_dict" or "model"
    # depending on which save routine wrote the file
    backbone_state = model_dict.get("state_dict")
    if backbone_state is None:
        backbone_state = model_dict.get("model")
    if backbone_state is None:
        raise KeyError(
            f"No 'state_dict' or 'model' entry found in {path}"
        )

    self.model.load_state_dict(backbone_state)
    self.head.load_state_dict(model_dict["head"])
class RewardDataset(Dataset):
    """Dataset class for the reward model

    read a json file with the following format:
    [
        {
            "user_input": "...",
            "completion": "...",
            "score": ...
        },
        ...
    ]
    Where:
        user_input: the initial input of the user
        completion: the completion generated by the model
        score: the score given by the user to the completion (or by the LLM)

    Samples whose score is missing, None or an empty string get
    ``default_score``. A numeric score of 0 is a valid score and is kept.
    """

    # score assigned to samples without a usable score
    DEFAULT_SCORE = 2.5

    def __init__(self, path: str, default_score: float = DEFAULT_SCORE) -> None:
        """Read the whole json file at ``path`` into memory.

        Args:
            path (str): path of the json dataset
            default_score (float): score used when a sample has none
        """
        print(f"Loading dataset from {path}")
        with open(path, "r", encoding="utf-8") as f:
            self.data = list(json.load(f))
        self.default_score = float(default_score)
        print(f"Loaded {len(self.data)} samples")

    def __getitem__(self, idx: int):
        """Return ``(user_input + completion, score)`` for sample ``idx``."""
        sample = self.data[idx]
        text = sample["user_input"] + sample["completion"]
        raw_score = sample.get("score")
        # Only an absent score falls back to the default: testing the
        # truthiness would also discard a legitimate score of 0.
        if raw_score is None or raw_score == "":
            score = self.default_score
        else:
            score = float(raw_score)
        return (text, score)

    def __len__(
        self,
    ):
        """Number of samples in the dataset."""
        return len(self.data)
Methods: train: Train the reward model save_checkpoints: Save the checkpoints of the model load_checkpoints: Load the checkpoints of the model """ def __init__(self, config: ConfigReward) -> None: # save the config self.config = config # load the model self.reward = RewardModel(config) # optimizer self.optimizer = torch.optim.AdamW( self.reward.parameters(), lr=config.lr ) # loss function self.loss_function = torch.nn.MSELoss() # check validation dataset self.validation_flag = False if config.validation_dataset_path is not None: self.validation_flag = True # create dataset and dataloaders self.train_dataset = RewardDataset(config.train_dataset_path) self.train_dataloader = DataLoader( self.train_dataset, batch_size=config.batch_size ) if self.validation_flag: self.eval_dataset = RewardDataset(config.validation_dataset_path) self.validation_dataloader = DataLoader( self.eval_dataset, batch_size=config.batch_size ) # intilize scheduler - learning rate will drop to 10% of the initial # value self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts( self.optimizer, T_0=len(self.train_dataset) // config.batch_size, T_mult=1, eta_min=config.lr * 0.1, last_epoch=-1, ) # initialize training stats stats_path = ModelLoader.get_training_stats_path(config) self.training_stats = TrainingStats(stats_path) # consistency check between accelerate and deepspeed if config.accelerate_enable and config.deepspeed_enable: raise ValueError( "Both DeepSpeed and Accelerate are enabled for the Reward." "Please choose one of them." 
@beartype
def save_checkpoint(
    self,
    current_epoch: int,
    current_step: int,
    max_epochs: int,
    max_steps: int,
) -> None:
    """Save the checkpoints of the model

    Without DeepSpeed the checkpoint is a single file holding the backbone
    weights ("state_dict"), the reward head weights ("head"), optimizer and
    scheduler states, the training stats and the epoch/step counters.
    With DeepSpeed the model engine writes its own checkpoint and only the
    epoch/step counters are passed as client state.

    Args:
        current_epoch (int): Current epoch
        current_step (int): Current step
        max_epochs (int): Maximum number of epochs
        max_steps (int): Maximum number of steps
    """
    print(
        f"Saving checkpoint for epoch {current_epoch + 1}, "
        f" step {current_step} ..."
    )

    # get the path to save the checkpoint
    model_folder, model_name, path = ModelLoader.get_model_path(
        config=self.config,
        is_checkpoint=True,
        current_epoch=current_epoch,
        current_step=current_step,
        max_epochs=max_epochs,
        max_steps=max_steps,
    )

    # remove the checkpoint if it already exists
    # (removed as a directory tree when DeepSpeed is enabled, as a file
    # otherwise)
    if os.path.exists(path):
        if self.config.deepspeed_enable:
            shutil.rmtree(path)
        else:
            os.remove(path)

    # save the checkpoint
    if self.config.deepspeed_enable:
        client_state = {
            "epoch": current_epoch,
            "step": current_step,
        }
        self.model_engine.save_checkpoint(path, client_state=client_state)
    else:
        torch.save(
            {
                "state_dict": self.reward.model.state_dict(),
                # the head is trained together with the backbone and is
                # not part of reward.model: without it a resumed run
                # restarts from a randomly initialized head
                "head": self.reward.head.state_dict(),
                "optim_state_dict": self.optimizer.state_dict(),
                "scheduler_state_dict": self.scheduler.state_dict(),
                "training_stats": self.training_stats,
                "epoch": current_epoch,
                "step": current_step,
            },
            path,
        )


@beartype
def load_checkpoint(
    self,
) -> Tuple[int, int]:
    """Load the checkpoints of the model

    Returns:
        Tuple[int, int]: The current epoch and step
            from which you should resume the training. (0, 0) when no
            checkpoint exists or the checkpoint cannot be read.
    """
    corrupted_message = (
        "Checkpoint corrupted! "
        "Try to remove the last checkpoint. "
        "Now Starting from epoch 0, step 0"
    )

    print("Looking for checkpoints...")
    # look for the checkpoints
    path = ModelLoader.check_model_path(
        config=self.config,
        is_checkpoint=True,
        current_epoch=None,
    )

    # no checkpoint: start from scratch
    if path is None:
        return 0, 0

    print("Loading ...")
    if self.config.deepspeed_enable:
        # Best effort: an unreadable checkpoint restarts the training
        # instead of aborting it.
        try:
            _, client_state = self.model_engine.load_checkpoint(path)
        except Exception:
            print(corrupted_message)
            return 0, 0
        # load epoch and step to resume loops
        epoch = client_state["epoch"]
        step = client_state["step"]
    else:
        try:
            checkpoint = torch.load(path)
        except Exception:
            print(corrupted_message)
            return 0, 0
        # load the model parameters and optimizer parameters
        # from the checkpoint
        epoch = checkpoint["epoch"]
        self.reward.model.load_state_dict(checkpoint["state_dict"])
        # checkpoints written before the head was stored have no "head"
        # entry: keep them loadable
        if "head" in checkpoint:
            self.reward.head.load_state_dict(checkpoint["head"])
        self.optimizer.load_state_dict(checkpoint["optim_state_dict"])
        self.scheduler.load_state_dict(
            checkpoint["scheduler_state_dict"]
        )
        self.training_stats = checkpoint["training_stats"]
        step = checkpoint["step"]

    # the saved step is already done: resume from the next one
    return epoch, step + 1
pass if self.config.deepspeed_enable: self.model_engine.backward(loss) self.model_engine.step() elif self.config.accelerate_enable: self.optimizer.zero_grad() self.accelerator.backward(loss) self.optimizer.step() self.scheduler.step() else: self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.scheduler.step() # print progress if i % iteration_per_print == 0: print( f"Epoch: {epoch+1}/{epochs}, " f"Iteration: {i+1}/{n_iter}, " f"Training Loss: {loss.item()}" ) printed_est_output = [ round(float(x), 1) for x in est_output.cpu().tolist() ] print( "prediction", printed_est_output, "target", score.cpu().tolist(), ) # checkpoints saving if cnt_checkpoints % checkpoint_steps == 0: self.save_checkpoint(epoch, i, epochs, n_iter) cnt_checkpoints = 1 else: cnt_checkpoints += 1 # Validation if self.validation_flag: self.reward.eval() with torch.no_grad(): for i, (text, score) in enumerate( self.validation_dataloader ): # tokenize inputs input_tokens = self.reward.tokenizer( text, return_tensors="pt", padding=True ) input_tokens = input_tokens.to(device) # TODO: check on the length of the input tokens if # they are too many it can create problems output = torch.tensor(score, dtype=torch.float32).to( device ) # forward pass est_output = self.reward.get_reward( input_tokens["input_ids"], input_tokens["attention_mask"], ) # compute loss loss = self.loss_function(est_output, output) self.training_stats.validation_loss.append(loss.item()) # print progress if i % iteration_per_print == 0: print( f"Epoch: {epoch+1}/{epochs}, " f"Iteration: {i+1}/{n_iter}, " f"Validation Loss: {loss.item()}" ) # reset start_step after training is resumed start_step = 0 # save the model at the end of the training self.reward.save() ================================================ FILE: optimization/chatllama/chatllama/rlhf/trainer.py ================================================ import json import os import random from collections import deque, namedtuple import deepspeed import 
def change_tokenization(tokens, tokenizer1, tokenizer2):
    """Change the tokenizer of the tokens

    Each sequence is decoded with tokenizer1, stripped of tokenizer1's
    pad and eos tokens and re-encoded as a padded, truncated batch with
    tokenizer2.

    Args:
        tokens (torch.Tensor): Tokens to be changed
        tokenizer1 (transformers.PreTrainedTokenizer): Tokenizer to be changed
        tokenizer2 (transformers.PreTrainedTokenizer): Tokenizer to be changed
            to

    Returns:
        encoded_tokens: Encoded tokens
    """
    # A tokenizer may define no pad or eos token (None): skip those,
    # str.replace would raise a TypeError on None.
    special_tokens = [
        special
        for special in (tokenizer1.pad_token, tokenizer1.eos_token)
        if special
    ]

    with torch.no_grad():
        # decode tokens
        decoded_tokens = [tokenizer1.decode(token) for token in tokens]

        # remove all the pad and eos tokens
        cleaned_texts = []
        for text in decoded_tokens:
            for special in special_tokens:
                text = text.replace(special, "")
            cleaned_texts.append(text)

        # encode the actions with critic tokenizer
        encoded_tokens = tokenizer2(
            cleaned_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
    return encoded_tokens
@beartype
def save(self) -> None:
    """Write the actor and the critic weights to their model paths.

    The actor path is resolved from the whole training config rather than
    from the actor config (per the original note this targets the
    actor_rl folder instead of the actor folder -- resolution happens in
    ModelLoader.get_model_path). The critic path is resolved from the
    critic config.
    """
    # actor: only the language model weights, under "state_dict"
    _, _, actor_path = ModelLoader.get_model_path(
        config=self.config,
        is_checkpoint=False,
    )
    print(f"Saving model to {actor_path} ...")
    actor_payload = {"state_dict": self.actor.model.state_dict()}
    torch.save(actor_payload, actor_path)

    # critic: backbone and value head, under "model" and "head"
    _, _, critic_path = ModelLoader.get_model_path(
        config=self.config.critic,
        is_checkpoint=False,
    )
    print(f"Saving model to {critic_path} ...")
    critic_payload = {
        "model": self.critic.model.state_dict(),
        "head": self.critic.head.state_dict(),
    }
    torch.save(critic_payload, critic_path)
Args: sequences_actor (torch.Tensor): Sequences composed of [states, actions] for the actor sequence_mask_actor (torch.Tensor): Mask for the sequences of the actor sequences_critic (torch.Tensor): Sequences composed of [states, actions] for the critic sequences_mask_critic (torch.Tensor): Mask for the sequences of the critic action_len_actor (int): Length of the actions in the sequences for the actor action_len_critic (int): Length of the actions in the sequences for the critic Returns: action_logits (torch.Tensor): Logits for the actions in the sequences values (torch.Tensor): Values for the actions in the sequences """ # use a single forward on the whole sequence # to get pi(y | x) and ignore predicted output actions_logits = self.actor.forward( sequences_actor, sequences_mask_actor ) # use the critic forward to get the values for the actions values = self.critic.forward(sequences_critic, sequences_mask_critic) # return only logits and values for the actions taken real_actions_logits = actions_logits[:, -action_len_actor:, :] real_values = values[:, -action_len_critic:] if self.debug: print("ActorCritic.forward") print("action_len_actor", action_len_actor) print("action_len_critic", action_len_critic) print("sequences_actor.shape", sequences_actor.shape) print("sequences_actor", sequences_actor) print("sequences_critic.shape", sequences_critic.shape) print("sequences_critic", sequences_critic) print("real_action_logits.shape", actions_logits.shape) print("real_action_logits", actions_logits) print("real_values.shape", values.shape) print("real_values", values) return ( real_actions_logits, real_values, ) @torch.no_grad() @beartype def generate( self, states_actor: torch.Tensor, states_mask_actor: torch.Tensor, states_critic: torch.Tensor, ) -> Tuple: """Generate actions, actions_logits, values and sequences from states Args: states_actor (torch.Tensor): States for the actor states_mask_actor (torch.Tensor): Mask for the states for the actor states_critic 
(torch.Tensor): States for the critic Returns: actions (torch.Tensor): Actions generated from the states actions_logits (torch.Tensor): Logits for the actions generated from the states (i.e. pi(y | x)) values (torch.Tensor): Values generated by the critic model for the actions generated by the actor (i.e. V(x)) sequences (torch.Tensor): Sequences generated from the states as [states, actions] """ # generate action sequence from the actor actions, sequences_actor = self.actor.generate( states_actor, states_mask_actor ) # create mask for the actor sequences sequences_mask_actor = ( (sequences_actor != self.actor.tokenizer.pad_token_id) .to(sequences_actor.device) .long() .detach() ) # get the length of the actions action_len_actor = actions.shape[1] # check if different encoding is needed for the critic if self.use_same_tokenizer: sequences_critic = sequences_actor sequences_mask_critic = sequences_mask_actor action_len_critic = action_len_actor else: encoded_critic = change_tokenization( sequences_actor, self.actor.tokenizer, self.critic.tokenizer, ) # split the encoded_critic in tokens and maks sequences_critic = encoded_critic["input_ids"].to( sequences_actor.device, ) sequences_mask_critic = ( encoded_critic["attention_mask"] .to(sequences_actor.device) .long() .detach() ) # compute len of actions for the critic tokenizer action_len_critic = states_critic.shape[1] # generate actions_logits and values actions_logits, values = self.forward( sequences_actor, sequences_mask_actor, sequences_critic, sequences_mask_critic, action_len_actor, action_len_critic, ) if self.debug: print("ActorCritic.generate") print("actions shape", actions.shape) print("actions", actions) print("sequence shape", sequences_actor.shape) print("sequence", sequences_actor) print("actions_logits shape", actions_logits.shape) print("actions_logits", actions_logits) print("values shape", values.shape) print("values", values) return ( actions, actions_logits, values, sequences_actor, 
class ExamplesSampler:
    """Pool of prompts from which training examples are drawn.

    The prompts are read from a json file with the following format:
    [
        {
            "user_input" : "",
        } ,
        ...
    ]
    Where:
        user_input: is the input of the user or directly the input of the
            user with the memory preappended (i.e. user_input + memory)
    """

    def __init__(
        self,
        path: str,
    ) -> None:
        """Read every "user_input" entry of the json file at ``path``."""
        self.path = path
        with open(path, "r") as handle:
            records = json.load(handle)
        prompts = []
        for record in records:
            prompts.append(record["user_input"])
        self.data = prompts

    def sample(self, n: int) -> List:
        """Draw n distinct examples at random from the stored prompts.

        Args:
            n (int): Number of examples to sample
        """
        picked = random.sample(self.data, n)
        return picked
def __init__(
    self,
    config: Config,
) -> None:
    """Build the models, optimizers and helpers used for RL training.

    Creates the actor-critic pair, one Adam optimizer for each of them,
    the reward model, the training stats and conversation log stores and
    the examples sampler. When DeepSpeed is enabled for the actor, the
    critic or the reward model, the distributed backend is initialized
    and each enabled model is wrapped by its own DeepSpeed engine.

    Args:
        config (Config): Configuration of the trainer
    """
    # save config
    self.config = config

    # set debug mode
    self.debug = config.trainer.debug

    # initialize agent-critic
    self.actorcritic = ActorCritic(config)

    # initialize actor optimizer
    self.actor_optimizer = torch.optim.Adam(
        self.actorcritic.actor.parameters(), lr=config.trainer.actor_lr
    )

    # initialize critic optimizer
    self.critic_optimizer = torch.optim.Adam(
        self.actorcritic.critic.parameters(), lr=config.trainer.critic_lr
    )

    # scheduler (defined in the learn() method (i need dataset size))
    self.actor_scheduler = None
    self.critic_scheduler = None

    # initialize reward model
    self.reward = RewardModel(config.reward)

    # initialize class to store training stats
    path = ModelLoader.get_training_stats_path(config)
    self.training_stats = TrainingStats(path)

    # the conversation log is stored in the checkpoint folder
    model_folder, _, _ = ModelLoader.get_model_path(
        config,
        is_checkpoint=True,
    )
    path = os.path.join(model_folder, "conversations_log.json")
    self.conversation_log = ConversationLog(path)

    # initialize examples sampler
    self.example_sampler = ExamplesSampler(config.trainer.examples_path)

    # check if actor and critic use the same tokenizer
    self.actorcritic.use_same_tokenizer = check_model_family(
        config.actor, config.critic
    )

    # check if actor and reward use the same tokenizer
    self.use_same_tokenizer = check_model_family(
        config.actor, config.reward
    )

    # eps
    self.eps = 1e-8

    # deepspeed initialization
    self.actor_model_engine = None
    self.critic_model_engine = None

    # The distributed backend is needed as soon as ANY of the three
    # models runs under DeepSpeed, the reward model included.
    self.is_deepspeed_init = bool(
        self.config.actor.deepspeed_enable
        or self.config.critic.deepspeed_enable
        or self.config.reward.deepspeed_enable
    )
    if self.is_deepspeed_init:
        deepspeed.init_distributed("nccl")
        os.environ["TOKENIZERS_PARALLELISM"] = "False"

    if self.config.actor.deepspeed_enable:
        (
            self.actor_model_engine,
            self.actorcritic.actor,
            self.actor_optimizer,
        ) = self.initialize_deepspeed_model(
            config=self.config.actor, model=self.actorcritic.actor
        )

    if self.config.critic.deepspeed_enable:
        (
            self.critic_model_engine,
            self.actorcritic.critic,
            self.critic_optimizer,
        ) = self.initialize_deepspeed_model(
            config=self.config.critic, model=self.actorcritic.critic
        )

    if self.config.reward.deepspeed_enable:
        # the reward model is not trained here: only the wrapped module
        # is kept, engine and optimizer are discarded
        (
            _,
            self.reward,
            _,
        ) = self.initialize_deepspeed_model(
            config=self.config.reward, model=self.reward
        )
# Deepspeed checkpoints are already directories and will be overwritten if os.path.exists(path) and not self.is_deepspeed_init: os.remove(path) # save the checkpoint actor_checkpoint_dict = { "episode": current_episode, "critic_state_dict": self.actorcritic.critic.state_dict(), "critic_optim_state_dict": self.critic_optimizer.state_dict(), } if self.config.actor.deepspeed_enable: # The model and optimizer state dicts are actually already saved # In the deepspeed model engine. But to make sure no depending # methods fail, the states are included in actor_checkpoint_dict. # ATTENTION: If you use deepspeed zero optimization, the client_state # will not be saved self.actor_model_engine.save_checkpoint( save_dir=path, client_state=actor_checkpoint_dict ) else: torch.save(actor_checkpoint_dict, path) # get the path to save the checkpoint for the actor model_folder, model_name, path = ModelLoader.get_model_path( config=self.config, is_checkpoint=True, current_epoch=current_episode, max_epochs=max_episode, max_steps=0, ) # if the checkpoint already exists remove it. # Deepspeed checkpoints are already directories and will be overwritten if os.path.exists(path) and not self.is_deepspeed_init: os.remove(path) # save the checkpoint critic_checkpoint_dict = { "episode": current_episode, "actor_state_dict": self.actorcritic.actor.state_dict(), "actor_optim_state_dict": self.actor_optimizer.state_dict(), "training_stats": self.training_stats, } if self.config.critic.deepspeed_enable: # The model and optimizer state dicts are actually already saved # In the deepspeed model engine. But to make sure no depending # methods fail, the states are included in critic_checkpoint_dict. 
# ATTENTION: If you use deepspeed zero optimization, the client_state # will not be saved self.critic_model_engine.save_checkpoint( save_dir=path, client_state=critic_checkpoint_dict ) else: torch.save(critic_checkpoint_dict, path) @beartype def load_checkpoint( self, ) -> int: critic_episode = -1 actor_episode = -1 # check if there are some checkpoint for the critic print("Looking for checkpoints...") path = ModelLoader.check_model_path( config=self.config.critic, is_checkpoint=True, current_epoch=None, ) # if there are checkpoint if path is not None: # load the critic checkpoint print("Loading ...") try: checkpoint = torch.load(path) except Exception: print( "Checkpoint of critic corrupted!" "Try to remove the last checkpoint." "Now Starting from episode 0" ) return 0 # load checkpoint into model critic_episode = checkpoint["episode"] self.actorcritic.critic.load_state_dict( checkpoint["critic_state_dict"] ) self.critic_optimizer.load_state_dict( checkpoint["critic_optim_state_dict"] ) # check if there are checkpoints for the actor print("Looking for checkpoints...") path = ModelLoader.check_model_path( config=self.config, is_checkpoint=True, current_epoch=None, ) # if there are some checkpoints if path is not None: # load the actor checkpoint print("Loading ...") try: checkpoint = torch.load(path) except Exception: print( "Checkpoint of actor corrupted!" "Try to remove the last checkpoint." 
"Now Starting from episode 0" ) return 0 # load checkpoint into the model actor_episode = checkpoint["episode"] self.actorcritic.actor.load_state_dict( checkpoint["actor_state_dict"] ) self.actor_optimizer.load_state_dict( checkpoint["actor_optim_state_dict"] ) self.training_stats = checkpoint["training_stats"] # check if there are some discrepancies between the checkpoints if critic_episode == actor_episode: # all ok start from next episode return critic_episode + 1 else: print( f"There are some discrepancies between the checkpoints" f"of actor and critic \nactor episode: {actor_episode}" f"\n critic episode: {critic_episode}\n" ) return min(critic_episode, actor_episode) + 1 @beartype def learn(self, memories: Deque[Memory]) -> None: """Train the agent-critic model using RL: - for each batch of episodes, compute action logits and values - then compare action logits probs with memories one and values with rewards to compute the PPO loss and update the actor-critic model """ print("Start to Learn...") # get parameters epochs = self.config.trainer.epochs actor_eps_clip = self.config.trainer.actor_eps_clip critic_eps_clip = self.config.trainer.critic_eps_clip beta_s = self.config.trainer.beta_s batch_size = self.config.trainer.batch_size device = ( torch.device(f"cuda:{dist.get_rank()}") if self.is_deepspeed_init else self.config.trainer.device ) # create dataset from memories dataset = ExperienceDataset(memories, device) if self.is_deepspeed_init: engine = self.actor_model_engine or self.critic_model_engine dataloader = engine.deepspeed_io(dataset) else: dataloader = DataLoader(dataset, batch_size=batch_size) # initialize scheduler for actor actor_lr = self.config.trainer.actor_lr # This lr_scheduler is not available in deepspeed # see https://deepspeed.readthedocs.io/en/latest/schedulers.html if not self.is_deepspeed_init: self.actor_scheduler = CosineAnnealingWarmRestarts( self.actor_optimizer, T_0=len(dataset), eta_min=actor_lr * 0.1 ) # initialize scheduler for 
critic critic_lr = self.config.trainer.critic_lr # This lr_scheduler is not available in deepspeed # see https://deepspeed.readthedocs.io/en/latest/schedulers.html if not self.is_deepspeed_init: self.critic_scheduler = CosineAnnealingWarmRestarts( self.critic_optimizer, T_0=len(dataset), eta_min=critic_lr * 0.1 ) # initialize actor accelerate if self.config.actor.accelerate_enable is True: actor_accelerator = Accelerator() ( actor_model, self.actor_optimizer, self.train_dataloader, self.actor_scheduler, ) = actor_accelerator.prepare( self.actorcritic.actor, self.actor_optimizer, self.train_dataloader, self.actor_scheduler, ) self.actorcritic.actor = actor_model # initialize critic accelerate if self.config.critic.accelerate_enable is True: critic_accelerator = Accelerator() ( critic_model, self.critic_optimizer, self.critic_scheduler, ) = critic_accelerator.prepare( self.actorcritic.critic, self.critic_optimizer, self.critic_scheduler, ) self.actorcritic.critic = critic_model # train agent-critic self.actorcritic.train() for epoch in range(epochs): for k, batch in enumerate(dataloader): ( states_actor, old_actions, old_values, rewards, old_actions_log_probs, sequences_actor, sequences_mask_actor, sequences_critic, sequences_mask_critic, action_len_actor, action_len_critic, ) = [tensor.to(device) for tensor in batch] if self.debug: print( f"#########################################" f" batch from memories {k} \n " f"#########################################" f"states_actor {states_actor.shape} \n" f"old_actions {old_actions.shape} \n" f"old_values {old_values.shape} \n" f"rewards {rewards.shape} \n" f"old_actions_log_probs " f"{old_actions_log_probs.shape}\n" f"sequences_actor {sequences_actor.shape} \n" f"sequences_mask_actor " f"{sequences_mask_actor.shape} \n" f"sequences_critic {sequences_critic.shape} \n" f"sequences_mask_critic " f"{sequences_mask_critic.shape} \n" f"action_len_actor {action_len_actor} \n" f"action_len_critic {action_len_critic} \n" 
f"#########################################" ) # get actor critic new probabilities and values actions_logits, values = self.actorcritic.forward( sequences_actor, sequences_mask_actor, sequences_critic, sequences_mask_critic, action_len_actor.item(), action_len_critic.item(), ) # get action log prob actions_prob = ( torch.softmax(actions_logits, dim=-1).max(dim=-1).values ) actions_log_prob = torch.log(actions_prob + self.eps) # compute entropy entropies = (actions_prob * actions_log_prob).sum(dim=-1) # compute KL divergence kl_div_loss = ( (actions_prob * (old_actions_log_probs - actions_log_prob)) .sum(dim=-1) .mean() ) # compute ratios ratios = (actions_log_prob - old_actions_log_probs).exp() # compute PPO loss if check_model_family(self.config.actor, self.config.critic): # compute discounted rewards as in TRL gamma = self.config.trainer.gamma_discounted discounted_rewards = torch.zeros_like(old_values) for i in range(discounted_rewards.shape[1]): for j in range(i, discounted_rewards.shape[1]): discounted_rewards[:, i] += ( gamma ** (j - i) * rewards[:, j] ) advantages = ( discounted_rewards - old_values ) # TRL has opposite sign for old values advantages = (advantages - advantages.mean(dim=-1)) / ( advantages.std() + self.eps ) surr1 = advantages * ratios else: advantages = rewards - old_values[:, -1] surr1 = advantages * ratios surr2 = ( torch.clamp(ratios, 1 - actor_eps_clip, 1 + actor_eps_clip) * advantages ) policy_loss = -torch.min(surr1, surr2) - beta_s * entropies policy_loss = policy_loss.mean() loss = policy_loss + kl_div_loss # check if loss item is NaN if torch.isnan(loss): raise ValueError("Loss is nan") # update actor with loss if self.config.actor.deepspeed_enable: self.actor_model_engine.backward(loss) self.actor_model_engine.step() elif self.config.actor.accelerate_enable: self.actor_optimizer.zero_grad() actor_accelerator.backward(loss) self.actor_optimizer.step() self.actor_scheduler.step() else: self.actor_optimizer.zero_grad() 
loss.backward() self.actor_optimizer.step() self.actor_scheduler.step() # compute value loss # the loss is the distance between the rewards and the values # I want this distance to be small so that values are # representative of the rewards, for this reason i took the # maximum between the two. # The clip is limiting the slew-rate of values_loss_clipped value_loss_clipped = old_values + (values - old_values).clamp( -critic_eps_clip, critic_eps_clip ) value_loss1 = (value_loss_clipped - rewards) ** 2 value_loss2 = (values - rewards) ** 2 value_loss = torch.max(value_loss1, value_loss2).mean() if torch.isnan(value_loss): raise ValueError("Value loss is nan") # upate critic if self.config.critic.deepspeed_enable: self.critic_model_engine.backward(value_loss) self.critic_model_engine.step() elif self.config.critic.accelerate_enable: self.critic_optimizer.zero_grad() critic_accelerator.backward(loss) self.critic_optimizer.step() self.critic_scheduler.step() else: self.critic_optimizer.zero_grad() value_loss.backward() self.critic_optimizer.step() self.critic_scheduler.step() # append the losses to the training stats self.training_stats.training_loss.append( loss.detach().cpu().item() ) self.training_stats.value_loss.append( value_loss.detach().cpu().item() ) # print iteration info print( f"Epoch {epoch+1}/{epochs}", f"Step {k+1}/{int(len(dataloader) / batch_size)}", f"Loss {loss.detach().cpu().item():.4f}", f"Value Loss {value_loss.detach().cpu().item():.4f}", ) self.actorcritic.eval() print("End Learning") def train( self, ) -> None: print("Start RL Training") # initialize settings num_episodes = self.config.trainer.num_episodes max_timesteps = self.config.trainer.max_timesteps num_examples = self.config.trainer.num_examples update_timesteps = self.config.trainer.update_timesteps batch_size = self.config.trainer.batch_size checkpoint_steps = self.config.trainer.checkpoint_steps device = ( torch.device(f"cuda:{dist.get_rank()}") if self.is_deepspeed_init else 
self.config.trainer.device ) # number of elements that the memories should contain when learning number_of_memories_per_learn_iteration = ( num_examples * update_timesteps ) # the number of memories must be a multiple of the batch size assert ( number_of_memories_per_learn_iteration % batch_size == 0 ), "The number of memories must be a multiple of the batch size" # the total number of timesteps done in the train() are total_number_of_timesteps = num_episodes * max_timesteps # the total timesteps done should be a multiple of the update timesteps assert total_number_of_timesteps % update_timesteps == 0, ( "The number of timesteps (num_episodes*max_timesteps)" "must be a multiple of the update_timesteps" ) # initialize memories memories = deque([]) # load checkpoint start_episode = self.load_checkpoint() # if it is a new training from the start clear the conversation log if start_episode == 0: self.conversation_log.clear() # initialize counters cnt_timesteps = 0 cnt_learn_iter = 0 # loop over episodes and timesteps self.actorcritic.eval() for episode in range(start_episode, num_episodes): for timestep in range(max_timesteps): # print the iteration info print( f"Episode: {episode + 1}/{num_episodes}, " f"Timestep: {timestep + 1}/{max_timesteps}", f"Learning Cnt: {cnt_timesteps + 1}/{update_timesteps}", ) # counter used to count timesteps into memory cnt_timesteps += 1 # sample num_examples examples from example dataset inputs = self.example_sampler.sample(num_examples) # tokenize examples for the actor tok_inputs_act = self.actorcritic.actor.tokenizer( inputs, padding=True, return_tensors="pt", truncation=True ) # states are [batch_size, seq_len_of_states] states_actor = tok_inputs_act["input_ids"].to(device) states_mask_actor = tok_inputs_act["attention_mask"].to(device) # tokenize examples for the critic tok_inputs_crt = self.actorcritic.critic.tokenizer( inputs, padding=True, return_tensors="pt", truncation=True ) # states are [batch_size, seq_len_of_states] 
states_critic = tok_inputs_crt["input_ids"].to(device) # generate sequences of actions and values ( actions, actions_logits, values, sequences_actor, sequences_mask_actor, sequences_critic, sequences_mask_critic, action_len_actor, action_len_critic, ) = self.actorcritic.generate( states_actor, states_mask_actor, states_critic ) # compute action log probs action_prob = ( torch.softmax(actions_logits, dim=-1).max(dim=-1).values ) actions_log_probs = torch.log(action_prob + self.eps) # get tokenized sequence for the reward models if self.use_same_tokenizer: reward_sequence = sequences_actor reward_mask = sequences_mask_actor elif check_model_family( self.config.critic, self.config.reward ): reward_sequence = sequences_critic reward_mask = sequences_mask_critic else: tokenized_responses = change_tokenization( sequences_actor, self.actorcritic.actor.tokenizer, self.reward.tokenizer, ) # get tokens and mask reward_sequence = tokenized_responses["input_ids"].to( device ) reward_mask = tokenized_responses["attention_mask"].to( device ) # compute rewards rewards = self.reward.forward( reward_sequence, reward_mask, ) rewards = rewards[:, -action_len_critic:] reward = rewards[:, -1] # store memories of the episode / timestep for i in range(states_actor.shape[0]): memories.append( Memory( states_actor[i, :].detach().cpu(), actions[i, :].detach().cpu(), values[i, :].detach().cpu(), rewards[i, :].detach().cpu(), actions_log_probs[i, :].detach().cpu(), sequences_actor[i, :].detach().cpu(), sequences_mask_actor[i, :].detach().cpu(), sequences_critic[i, :].detach().cpu(), sequences_mask_critic[i, :].detach().cpu(), int(action_len_actor), int(action_len_critic), ) ) # decode completions to be logged in the conversation log completions = [ self.actorcritic.actor.tokenizer.decode(action) for action in actions ] # remove pad tokens from completions completions = [ c.replace(self.actorcritic.actor.tokenizer.pad_token, "") for c in completions ] # remove eos tokens from completions 
completions = [ c.replace(self.actorcritic.actor.tokenizer.eos_token, "") for c in completions ] # strange i need to force this? completions = [c.replace("", "") for c in completions] # log the memories in the conversation log for i in range(states_actor.shape[0]): self.conversation_log.append( inputs[i], completions[i], reward[i].detach().cpu().item(), cnt_learn_iter, ) # learn from memories if (cnt_timesteps % update_timesteps == 0) and ( cnt_timesteps != 0 ): print("len memories", len(memories)) if not self.is_deepspeed_init or (dist.get_rank() == 0): self.conversation_log.save() self.learn(memories) mean_reward = sum([m.rewards[-1] for m in memories]) / len( memories ) print(f"Mean Reward: {mean_reward}") memories.clear() cnt_timesteps = 0 cnt_learn_iter += 1 if not self.is_deepspeed_init or (dist.get_rank() == 0): self.conversation_log.save() # save checkpoints if (episode % checkpoint_steps == 0) and (episode != 0): self.save_checkpoint( current_episode=episode, max_episode=num_episodes ) if not self.is_deepspeed_init or (dist.get_rank() == 0): self.conversation_log.save() # save the models if self.is_deepspeed_init: self.actorcritic.save_deepspeed(self.actor_model_engine, self.config) self.actorcritic.save_deepspeed( self.critic_model_engine, self.config.critic ) else: self.actorcritic.save() print("End RL Training") ================================================ FILE: optimization/chatllama/chatllama/rlhf/utils.py ================================================ import json import os from beartype import beartype from plotly import graph_objects as go class TrainingStats: """Training statistics Attributes: training_loss (List): List of training losses training_accuracy (List): List of training accuracies value_loss (List): List of value losses validation_loss (List): List of validation losses validation_accuracy (List): List of validation accuracies """ def __init__(self, path: str): """Initialize the training stats Args: path (str): Path to save the 
class TrainingStats:
    """Training statistics

    Attributes:
        training_loss (List): List of training losses
        training_accuracy (List): List of training accuracies
        value_loss (List): List of value losses
        validation_loss (List): List of validation losses
        validation_accuracy (List): List of validation accuracies
        path (str): Path of the JSON file used by save/load/clear
    """

    # (attribute / JSON key, legend label) for every tracked series, in the
    # order used for plotting and for the JSON file
    _SERIES = (
        ("training_loss", "Training loss"),
        ("training_accuracy", "Training accuracy"),
        ("value_loss", "Value loss"),
        ("validation_loss", "Validation loss"),
        ("validation_accuracy", "Validation accuracy"),
    )

    def __init__(self, path: str):
        """Initialize the training stats

        Args:
            path (str): Path to save the stats
        """
        self.training_loss = []
        self.training_accuracy = []
        self.value_loss = []
        self.validation_loss = []
        self.validation_accuracy = []
        self.path = path

    def plot(self):
        """Plot the training statistics using plotly

        Only the non-empty series are drawn; the x axis is logarithmic.
        """
        fig = go.Figure()
        for attr, label in self._SERIES:
            values = getattr(self, attr)
            if len(values) > 0:
                fig.add_trace(go.Scatter(y=values, name=label))
        fig.update_layout(
            showlegend=True, xaxis_type="log", xaxis_title="steps"
        )
        fig.show()

    def save(
        self,
    ):
        """Save the stats

        If the file already exists, the in-memory series are appended to
        the series stored in it (so every call appends the full in-memory
        lists again); otherwise a new file is written. Series missing from
        an existing file are created instead of raising ``KeyError``.
        """
        if os.path.exists(self.path):
            with open(self.path, "r") as f:
                stats = json.load(f)
            for attr, _ in self._SERIES:
                stats.setdefault(attr, []).extend(getattr(self, attr))
        else:
            stats = {attr: getattr(self, attr) for attr, _ in self._SERIES}
        with open(self.path, "w") as f:
            json.dump(stats, f, indent=4)

    def load(
        self,
    ):
        """Load the stats

        Replaces the in-memory series with the content of the file; a
        series absent from the file is loaded as an empty list.
        """
        with open(self.path, "r") as f:
            stats = json.load(f)
        for attr, _ in self._SERIES:
            setattr(self, attr, stats.get(attr, []))

    def clear(
        self,
    ):
        """Clear the stats

        Empties every in-memory series and deletes the file if it exists.
        """
        for attr, _ in self._SERIES:
            setattr(self, attr, [])
        if os.path.exists(self.path):
            os.remove(self.path)
class ConversationLog:
    """Save the conversation:
    (user input, model output, rewards and learn_counter)
    during the RL training loop.

    Attributes:
        conversation (list): Logged entries, one dict per conversation
        path (str): JSON file used by save/load/clear
    """

    def __init__(self, path: str):
        """Create an empty log.

        Args:
            path (str): File used to persist the log; when None a default
                file in the current directory is used.
        """
        self.conversation = []
        self.path = path
        if self.path is None:
            self.path = "./convesation_log.json"
        # True once the entries stored in the file are known to be part of
        # ``self.conversation`` (after a save or a load). It prevents save()
        # from merging the same file content again on every call.
        self._merged_with_disk = False

    @beartype
    def append(
        self,
        user_input: str,
        model_output: str,
        reward: float,
        learn_counter: int,
    ):
        """Add a conversation to the log

        Args:
            user_input (str): User input / initial prompt
            model_output (str): Completion of the LLM model
            reward (float): Reward of the reward model assigned to the output
            learn_counter (int): Number of the learning iteration to
                distinguish the conversations that happens at different
                points of the training loop
        """
        self.conversation.append(
            {
                "user_input": user_input,
                "model_output": model_output,
                "reward": reward,
                "learn_counter": learn_counter,
            }
        )

    def save(self):
        """Write the log to ``self.path``, sorted by learn_counter.

        Entries already present in the file are merged into the in-memory
        log the first time only; later calls simply rewrite the file, so
        calling save() repeatedly does not duplicate entries.
        """
        print("Saving conversations log")
        already_merged = getattr(self, "_merged_with_disk", False)
        if os.path.exists(self.path) and not already_merged:
            with open(self.path, "r") as f:
                conversation = json.load(f)
            self.conversation.extend(conversation)
        self.conversation = sorted(
            self.conversation, key=lambda x: float(x["learn_counter"])
        )
        with open(self.path, "w") as f:
            json.dump(self.conversation, f, indent=4)
        self._merged_with_disk = True

    def load(self):
        """Replace the in-memory log with the content of the file."""
        with open(self.path, "r") as f:
            self.conversation = json.load(f)
        # memory now mirrors the file, nothing left to merge on save()
        self._merged_with_disk = True

    def clear(self):
        """Empty the in-memory log and delete the file if it exists."""
        print("Clearing conversations log")
        self.conversation = []
        # remove the file in path exists
        if os.path.exists(self.path):
            os.remove(self.path)
        self._merged_with_disk = False

    def show(self, current_iteration: int = None):
        """Show the conversation log

        Args:
            current_iteration (int): Current iteration of the training loop,
                if not None, print only the conversations whose
                learn_counter equals this value
        """
        for i, c in enumerate(self.conversation):
            if (
                current_iteration is not None
                and current_iteration != c["learn_counter"]
            ):
                continue
            print(
                f"##########################################\n"
                f"Conversation {i} at learn_counter "
                f"{c['learn_counter']}\n"
                f"##########################################\n"
                f"## User Input:\n\n{c['user_input']}\n\n"
                f"## Model Output:\n\n{c['model_output']}\n\n"
                f"## Reward: {c['reward']}\n\n"
            )
With CloudSurfer, users can input their model in their preferred deep learning framework and express their preferences for accuracy and performance. The library will then automatically test the model on a range of hardware and cloud platforms, using optimization techniques to ensure that the results are accurate and representative of the model's performances. Users can then compare the results side-by-side, seeing the performance of their model on different hardware and cloud providers. This is key to make informed decisions about which platform (cloud and hardware type) to pick, without having to guess or rely on outdated information. Overall, CloudSurfer provides a powerful and easy-to-use tool to optimize deep learning models and to choose the best inference hardware and cloud platform. Try it out today, and reach out if you have any feedback! ================================================ FILE: optimization/forward_forward/README.md ================================================ # Forward-Forward Algorithm This module implements a complete open-source version of [Geoffrey Hinton's Forward Forward](https://www.cs.toronto.edu/~hinton/FFA13.pdf) Algorithm, an alternative approach to backpropagation. The Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes, one with positive (i.e., real) data and the other with negative data that could be generated by the network itself. Unlike the backpropagation approach, Forward-Forward does not require calculating the gradient of the loss function with respect to the network parameters. Instead, each optimization step can be performed locally and the weights of each layer can be updated immediately after the layer has performed its forward pass. 
If you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers) Screenshot 2022-12-20 at 14 45 22 ## Installation The forward-forward module is built on top of nebullvm, a framework for efficiency-based modules. The library can be easily installed from source code. First you have to clone the repository and navigate to the app directory: ```bash git clone https://github.com/nebuly-ai/nebullvm.git cd nebullvm/apps/accelerate/forward_forward ``` Then install the module: ```bash pip install . ``` This process will just install the minimum requirements for running the module. If you want to run the module on a GPU you have to install the CUDA version of PyTorch. You can find the instructions on the official PyTorch website. ## Usage At the current stage, this implementation supports the main architectures discussed by Hinton in his paper. Each architecture can be trained with the following command: ```python from forward_forward import train_with_forward_forward_algorithm import os import torch device = "cuda" if torch.cuda.is_available() else "cpu" trained_model = train_with_forward_forward_algorithm( model_type="progressive", n_layers=3, hidden_size=2000, lr=0.03, device=device, epochs=100, batch_size=5000, theta=2., ) ``` Three architectures are currently supported: * `progressive`: the simplest architecture described in the paper. It has a pipeline-like structure and each layer can be trained independently from the following ones. Our implementation differs from the original one in that the labels are injected into the image by concatenating them to the flattened tensor, instead of replacing the first n_classes pixel values with a one-hot representation of the label. * `recurrent`: the recurrent architecture described in the paper. It has a recurrent-like structure and it is based on the `GLOM` architecture proposed by Hinton. * `nlp`: A simple network which can be used as a language model.
The recurrent and nlp network architectures are explained in more detail below. ## Recurrent Architecture The recurrent architecture is based on the `GLOM` architecture for videos, proposed by Hinton in the paper [How to represent part-whole hierarchies in a neural network](https://arxiv.org/pdf/2102.12627.pdf). Its application to the forward-forward algorithm aims at enabling each layer to learn not just from the previous layer's output, but from the following layers as well. This is done by concatenating the outputs of the previous layer and following layers computed at the previous time-step. A learned representation of the label (positive or negative) is given as input to the last layer. The following figure shows the structure of the network:

recurrent_net

## NLP Architecture The forward-forward architecture developed for NLP is a simple network which can be used as a language model. The network is composed of a few normalized fully connected layers followed by a ReLU activation. All hidden representations are then concatenated together and given as input to the softmax for predicting the next token. The network can be trained in a progressive way, i.e. each layer can be sequentially trained separately from the following ones. The following figure shows the structure of the network:

nlp_net

## What is missing This app implements the main architectures presented by Hinton in his paper. However, there are still some features that are not implemented yet. In particular, the following features are missing: * [ ] Implementation of unsupervised training. * [ ] Implementation of the `progressive` architecture using local receptive fields instead of fully connected layers. * [ ] Training on CIFAR-10 for CV-based architectures. And don't forget to [leave a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers) if you appreciate the project! If you have any questions about the implementation, [open an issue](https://github.com/nebuly-ai/nebullvm/issues) or contact us in the [community chat](https://discord.gg/RbeQMu886J). ## Contributing We welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the linked page for more information on how to get involved. A special thanks to [Additi Pandey](https://github.com/cyclotomicextension) for her amazing contribution to the Forward-Forward module.
def train_with_forward_forward_algorithm(
    n_layers: int = 2,
    model_type: str = "progressive",
    device: str = "cpu",
    hidden_size: int = 2000,
    lr: float = 0.03,
    epochs: int = 100,
    batch_size: int = 5000,
    theta: float = 2.0,
    shuffle: bool = True,
    **kwargs,
):
    """Train a Forward-Forward model and return the root operation's result.

    Args:
        n_layers (int): Number of hidden layers.
        model_type (str): Value accepted by ``ForwardForwardModelType``
            (the branches below handle PROGRESSIVE, RECURRENT and, as the
            fallback, NLP).
        device (str): Device forwarded to the root operation.
        hidden_size (int): Size of each hidden layer.
        lr (float): Learning rate passed to the Adam optimizer.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size.
        theta (float): Forwarded unchanged to the root operation.
        shuffle (bool): Forwarded unchanged to the root operation.
        **kwargs: Only ``predicted_tokens`` is inspected, and only to check
            that it was supplied for the NLP model.

    Returns:
        Whatever ``root_op.get_result()`` returns after execution.

    Raises:
        AssertionError: If the NLP model is requested without
            ``predicted_tokens``.
    """
    model_type = ForwardForwardModelType(model_type)
    root_op = ForwardForwardRootOp(model_type)

    # input/output sizes depend on the chosen architecture
    if model_type is ForwardForwardModelType.PROGRESSIVE:
        # labels are concatenated to the flattened 28x28 image
        input_size = 28 * 28 + len(datasets.MNIST.classes)
        output_size = None
    elif model_type is ForwardForwardModelType.RECURRENT:
        input_size = 28 * 28
        output_size = len(datasets.MNIST.classes)
    else:  # model_type is ForwardForwardModelType.NLP
        assert (
            kwargs.get("predicted_tokens") is not None
        ), "predicted_tokens must be specified for NLP model"
        input_size = 10  # number of characters
        output_size = 30  # length of vocabulary

    execute_kwargs = {
        "input_size": input_size,
        "n_layers": n_layers,
        "hidden_size": hidden_size,
        "optimizer_name": "Adam",
        "optimizer_params": {"lr": lr},
        "loss_fn_name": "alternative_loss_fn",
        "batch_size": batch_size,
        "epochs": epochs,
        "device": device,
        "shuffle": shuffle,
        "theta": theta,
        "output_size": output_size,
    }
    root_op.execute(**execute_kwargs)

    return root_op.get_result()
class ForwardForwardApp(App):
    """App wrapper that delegates execution to a ``ForwardForwardRootOp``."""

    def __init__(self, model_type="progressive"):
        """Create the app and its root operation.

        Args:
            model_type: A ``ForwardForwardModelType`` member or its string
                value. Defaults to "progressive" so that the historical
                no-argument construction keeps working; it used to raise
                ``TypeError`` because the root operation requires a model
                type.
        """
        super().__init__()
        # Imported here because only ForwardForwardRootOp is imported at
        # module level.
        from forward_forward.root_op import ForwardForwardModelType

        self.root_op = ForwardForwardRootOp(
            ForwardForwardModelType(model_type)
        )

    def execute(self, *args, **kwargs):
        """Forward all arguments to the root operation's ``execute``."""
        return self.root_op.execute(*args, **kwargs)
class MNISTDataLoaderOperation(Operation):
    """Operation that builds the MNIST train and test data loaders."""

    def __init__(self):
        super().__init__()
        self.train_data = None
        self.test_data = None

    def get_result(self) -> Any:
        """Return ``(train_loader, test_loader)``, or None before execute."""
        if self.train_data is None:
            return None
        return self.train_data, self.test_data

    def execute(self, batch_size: int, shuffle: bool):
        """Create the loaders; the train split is downloaded into "data"."""

        def normalized_mnist(**dataset_kwargs):
            # Same preprocessing for both splits: tensor conversion followed
            # by normalization with mean 0.1307 and std 0.3081.
            preprocessing = transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,)),
                ]
            )
            return datasets.MNIST(
                "data", transform=preprocessing, **dataset_kwargs
            )

        training_loader = torch.utils.data.DataLoader(
            normalized_mnist(train=True, download=True),
            batch_size=batch_size,
            shuffle=shuffle,
        )
        evaluation_loader = torch.utils.data.DataLoader(
            normalized_mnist(train=False),
            batch_size=1000,
            shuffle=False,
        )

        self.train_data = training_loader
        self.test_data = evaluation_loader
def download_fables():
    """Download the raw text of Aesop's fables.

    Returns:
        The HTTP response body decoded as UTF-8.
    """
    http_str = "http://classics.mit.edu/Aesop/fab.mb.txt"
    with urllib.request.urlopen(http_str) as response:
        html = response.read()
    return html.decode("utf-8")


def get_fables():
    """Split the downloaded text into individual fables.

    Only the text between the "SECTION 1" and "THE END" markers is kept.
    It is split on blank lines, and chunks shorter than 100 characters are
    discarded.
    """
    text = download_fables()
    text = text.split("SECTION 1")[1]
    text = text.split("THE END")[0]
    return [fable for fable in text.split("\n\n") if len(fable) >= 100]


# Character-level vocabulary: space, three punctuation marks and the
# lowercase ASCII letters, mapped to the ids 0..29 in this order.
VOCABULARY = {
    char: index
    for index, char in enumerate(" !,.abcdefghijklmnopqrstuvwxyz")
}


def tokenize(fable, max_len=100):
    """Convert a fable into at most ``max_len`` vocabulary ids.

    The text is lower-cased and characters outside ``VOCABULARY`` are
    dropped before truncation.
    """
    tokens = [
        VOCABULARY[char] for char in fable.lower() if char in VOCABULARY
    ]
    return tokens[:max_len]


def get_tokenized_fables(max_len=100):
    """Return a ``(n_fables, max_len)`` tensor of token ids.

    Args:
        max_len: Sequence length (an int). Fables that yield fewer than
            ``max_len`` tokens are skipped so that all rows stack.
    """
    tokenized_fables = [
        tokenize(fable, max_len=max_len) for fable in get_fables()
    ]
    return torch.stack(
        [
            torch.tensor(tokens)
            for tokens in tokenized_fables
            if len(tokens) == max_len
        ]
    )


def get_dataloader(batch_size=32, test_size=0.2, shuffle=True):
    """Build the train and test loaders over the tokenized fables.

    The first ``int(n * test_size)`` fables form the test set, which is
    served as a single batch; the remaining fables form the training set.

    Returns:
        A ``(train_loader, test_loader)`` tuple.
    """
    tokenized_fables = get_tokenized_fables()
    n_test = int(len(tokenized_fables) * test_size)
    test_set = torch.utils.data.TensorDataset(tokenized_fables[:n_test])
    train_set = torch.utils.data.TensorDataset(tokenized_fables[n_test:])
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=shuffle
    )
    # DataLoader rejects batch_size=0, which happened whenever the test
    # split was empty; an empty split now yields a loader with no batches.
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=max(n_test, 1), shuffle=False
    )
    return train_loader, test_loader
class BaseForwardForwardTrainer(Operation, ABC):
    """Common fetch-then-train flow shared by the Forward-Forward trainers.

    Subclasses implement ``_train``; this class fetches the model and the
    data through the fetch operations and records whether training ran.
    """

    def __init__(self):
        super().__init__()
        self.model = None
        self.train_data = None
        self.test_data = None
        self.fetch_model_op = FetchModelFromLocal()
        self.fetch_data_op = FetchTrainingDataFromLocal()

    def get_result(self):
        """Return the model once training has completed, otherwise None."""
        if self.state.get("model_is_trained"):
            return self.model
        return None

    def execute(
        self,
        model: FCNetFFProgressive,
        train_data: DataLoader,
        test_data: DataLoader,
        epochs: int,
        theta: float,
        device: str,
        **kwargs,
    ):
        """Fetch the model and data (once) and run the training routine.

        Args:
            model: Model to train.
            train_data: Training data loader.
            test_data: Evaluation data loader.
            epochs: Number of epochs forwarded to ``_train``.
            theta: Goodness threshold forwarded to ``_train``.
            device: Device string forwarded to ``_train``.
            **kwargs: Extra trainer-specific options forwarded to ``_train``.
        """
        if self.fetch_model_op.get_model() is None:
            self.fetch_model_op.execute(model)
        if self.fetch_data_op.get_train_data() is None:
            self.fetch_data_op.execute(train_data, test_data)

        self.model = self.fetch_model_op.get_model()
        self.train_data = self.fetch_data_op.get_train_data()
        self.test_data = self.fetch_data_op.get_test_data()

        everything_fetched = (
            self.model is not None
            and self.train_data is not None
            and self.test_data is not None
        )
        if everything_fetched:
            self._train(epochs, theta, device, **kwargs)
            # get_result() is gated on this flag and nothing in this module
            # set it, so the trained model was never handed back to the
            # root operation.
            self.state["model_is_trained"] = True

    @abstractmethod
    def _train(self, *args, **kwargs):
        raise NotImplementedError
class RecurrentForwardForwardTrainer(BaseForwardForwardTrainer):
    """Trainer for the recurrent Forward-Forward network on MNIST."""

    def _train(self, epochs: int, theta: float, device: str, **kwargs):
        """Train for ``epochs`` epochs, logging test accuracy after each.

        Args:
            epochs: Number of passes over the training loader.
            theta: Goodness threshold passed to the model.
            device: Device the model and the batches are moved to.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        model = self.model.to(device)
        n_classes = len(datasets.MNIST.classes)
        # Use the real test-set size instead of a hard-coded 10000 so the
        # reported accuracy stays correct for any test loader.
        n_test_samples = len(self.test_data.dataset)

        for epoch in range(epochs):
            accumulated_goodness = None
            model.train()
            for data, target in self.train_data:
                # TODO: THE IMAGE SHAPE SHOULD NOT BE DEFINED HERE
                data = data.to(device).reshape(-1, 28 * 28)
                target = torch.nn.functional.one_hot(
                    target.to(device), num_classes=n_classes
                )
                _, goodness = model.ff_train(data, target, theta)
                if accumulated_goodness is None:
                    accumulated_goodness = goodness
                else:
                    accumulated_goodness[0] += goodness[0]
                    accumulated_goodness[1] += goodness[1]

            goodness_ratio = (
                accumulated_goodness[0] - accumulated_goodness[1]
            ) / abs(max(accumulated_goodness))
            self.logger.info(f"Epoch {epoch + 1}")
            self.logger.info(f"Accumulated goodness: {accumulated_goodness}")
            self.logger.info(f"Goodness ratio: {goodness_ratio}")

            model.eval()
            correct = 0
            with torch.no_grad():
                for data, target in self.test_data:
                    data = data.to(device).reshape(-1, 28 * 28)
                    target = target.to(device)
                    pred, _ = model.positive_eval(data, theta)
                    correct += pred.eq(target.view_as(pred)).sum().item()

            accuracy = (
                correct / n_test_samples * 100 if n_test_samples else 0.0
            )
            self.logger.info(
                f"Test accuracy: {correct} / {n_test_samples} ({accuracy}%)"
            )
class ForwardForwardModelType(Enum):
    """Architectures supported by the Forward-Forward app."""

    PROGRESSIVE = "progressive"
    RECURRENT = "recurrent"
    NLP = "nlp"


class ForwardForwardRootOp(Operation):
    """Root operation chaining model building, data loading and training."""

    def __init__(self, model_type: ForwardForwardModelType):
        """Select the build, train and data operations for ``model_type``.

        Args:
            model_type: A ``ForwardForwardModelType`` member or its string
                value.

        Raises:
            ValueError: If ``model_type`` is not a valid model type.
        """
        super().__init__()
        # Normalize first: a plain string matched none of the identity
        # checks below and left the operation without sub-operations.
        model_type = ForwardForwardModelType(model_type)
        if model_type is ForwardForwardModelType.PROGRESSIVE:
            self.build_model = FCNetFFProgressiveBuildOperation()
            self.train_model = ForwardForwardTrainer()
            self.load_data = MNISTDataLoaderOperation()
        elif model_type is ForwardForwardModelType.RECURRENT:
            self.build_model = RecurrentFCNetFFBuildOperation()
            self.train_model = RecurrentForwardForwardTrainer()
            self.load_data = MNISTDataLoaderOperation()
        elif model_type is ForwardForwardModelType.NLP:
            self.build_model = LMFFNetBuildOperation()
            self.train_model = NLPForwardForwardTrainer()
            self.load_data = AesopFablesDataLoaderOperation()

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        batch_size: int,
        epochs: int,
        shuffle: bool,
        theta: float,
        device: str,
        output_size: int = None,
        **kwargs,
    ):
        """Run every stage whose result is not available yet.

        The model is built and the data loaded only when the corresponding
        operation has no result; training runs only when both are
        available and the trainer has no result. Extra ``kwargs`` are
        forwarded to the trainer. The trained model, if any, is stored
        under ``state["model"]``.
        """
        if self.build_model.get_result() is None:
            self.build_model.execute(
                input_size=input_size,
                n_layers=n_layers,
                hidden_size=hidden_size,
                optimizer_name=optimizer_name,
                optimizer_params=optimizer_params,
                loss_fn_name=loss_fn_name,
                output_size=output_size,
            )
        if self.load_data.get_result() is None:
            self.load_data.execute(batch_size=batch_size, shuffle=shuffle)

        if (
            self.build_model.get_result() is not None
            and self.load_data.get_result() is not None
        ):
            if self.train_model.get_result() is None:
                train_loader, test_loader = self.load_data.get_result()
                self.train_model.execute(
                    model=self.build_model.get_result(),
                    train_data=train_loader,
                    test_data=test_loader,
                    epochs=epochs,
                    theta=theta,
                    device=device,
                    **kwargs,
                )

        if self.train_model.get_result() is not None:
            self.state["model"] = self.train_model.get_result()

    def get_result(self):
        """Return the trained model, or None if training has not finished."""
        return self.state.get("model")
class LabelsInjector:
    """Append one-hot label encodings to flattened input images."""

    def __init__(self, labels: List):
        # Keep one one-hot vector of size len(labels) per label, indexed by
        # the label's position in ``labels``.
        self.label_names = labels
        self.labels = [
            torch.nn.functional.one_hot(
                torch.tensor([i]), len(labels)
            ).reshape(-1)
            for i in range(len(labels))
        ]

    @torch.no_grad()
    def inject_train(self, input_image: torch.Tensor, labels: torch.Tensor):
        """Build the positive and negative training samples for a batch.

        Args:
            input_image: Batch of images; flattened to ``(batch, -1)``.
            labels: Integer class index of every image.

        Returns:
            ``(images, signs)``. The first ``batch`` rows of ``images``
            carry the true label, the last ``batch`` rows a randomly
            chosen different label. ``signs`` is +1 for the former and -1
            for the latter.
        """
        bs = input_image.shape[0]
        flat_images = input_image.reshape(bs, -1)
        injecting_labels = torch.stack(
            [self.labels[label] for label in labels]
        )
        negative_injecting_labels = torch.stack(
            [
                self.labels[label]
                for label in select_random_different_label(
                    labels, len(self.labels)
                )
            ]
        )
        positive_images = torch.cat([flat_images, injecting_labels], dim=1)
        negative_images = torch.cat(
            [flat_images, negative_injecting_labels], dim=1
        )
        images = torch.cat([positive_images, negative_images], dim=0)
        signs = torch.cat([torch.ones(bs), -torch.ones(bs)], dim=0)
        return images, signs

    @torch.no_grad()
    def inject_eval(self, input_image: torch.Tensor):
        """Pair every image with every candidate label.

        Returns:
            A tensor of shape ``(batch, n_labels, features + n_labels)``
            where entry ``[b, k]`` is image ``b`` followed by the one-hot
            encoding of label ``k``.
        """
        batch_size = input_image.shape[0]
        labels = torch.stack(self.labels).unsqueeze(0)
        labels = labels.repeat(batch_size, 1, 1)
        input_image = input_image.reshape(batch_size, -1).unsqueeze(1)
        replicated_input = input_image.repeat(1, len(self.labels), 1)
        return torch.cat([replicated_input, labels], dim=2)


def select_random_different_label(labels: torch.Tensor, n_classes: int):
    """Yield, for every label, a random class index different from it.

    The previous implementation iterated over ``enumerate(labels)`` and
    compared each sample with an ``(index, label)`` tuple, so a sampled
    label was never rejected and could equal the true one. Adding a random
    non-zero offset modulo ``n_classes`` picks uniformly among the other
    classes without a rejection loop.

    Args:
        labels: One-dimensional collection of integer class indices.
        n_classes: Total number of classes; must be at least 2.

    Yields:
        Zero-dimensional integer tensors, one per input label.

    Raises:
        ValueError: If ``n_classes`` is smaller than 2.
    """
    if n_classes < 2:
        raise ValueError(
            "n_classes must be at least 2 to pick a different label"
        )
    labels = torch.as_tensor(labels)
    offsets = torch.randint(
        1, n_classes, labels.shape, device=labels.device
    )
    yield from (labels + offsets) % n_classes
def loss_fn(y, theta, sign):
    """Linear goodness loss.

    Args:
        y: Layer activations of shape ``(batch, features)``.
        theta: Goodness threshold subtracted from the mean squared
            activation.
        sign: +1 for positive samples, -1 for negative samples.

    Returns:
        ``(loss, mean_logit)`` where ``loss`` is a scalar tensor and
        ``mean_logit`` a Python float.
    """
    logits = torch.square(y).mean(dim=1) - theta
    with torch.no_grad():
        accumulated_logits = logits.mean().item()
    loss = (-logits * sign).mean()
    return loss, accumulated_logits


def probabilistic_loss_fn(y, theta, sign):
    """Signed negative log of the sigmoid of the goodness logit.

    Same arguments and return value as ``loss_fn``.
    """
    logits = torch.square(y).mean(dim=1) - theta
    prob = torch.sigmoid(logits)
    with torch.no_grad():
        accumulated_logits = logits.mean().item()
    loss = (-torch.log(prob + 1e-6) * sign).mean()
    return loss, accumulated_logits


def alternative_loss_fn(y, theta, sign):
    """Softplus goodness loss, ``log(1 + exp(-sign * logit))``.

    Computed with ``softplus``: the explicit ``exp`` overflowed for large
    logits, and the ``nan_to_num`` clamp then produced a saturated loss
    with no useful gradient.

    Same arguments and return value as ``loss_fn``.
    """
    logits = y.pow(2).mean(dim=1) - theta
    with torch.no_grad():
        accumulated_logits = logits.mean().item()
    loss = torch.nn.functional.softplus(-logits * sign).mean()
    return loss, accumulated_logits


class BaseFFLayer(torch.nn.Module, ABC):
    """Interface of every module trainable with forward-forward."""

    @abstractmethod
    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        raise NotImplementedError

    @abstractmethod
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        raise NotImplementedError

    @property
    def requires_training(self):
        return True


class FFLayer(BaseFFLayer):
    """Layer wrapper for efficient forward-forward layers."""

    def __init__(
        self,
        layer,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        """Wrap ``layer`` with its own optimizer and goodness loss.

        Args:
            layer: Module to train.
            optimizer_name: Name of a class in ``torch.optim``.
            optimizer_kwargs: Keyword arguments for the optimizer.
            loss_fn_name: "loss_fn", "alternative_loss_fn" or
                "probabilistic_loss_fn".

        Raises:
            ValueError: If ``loss_fn_name`` is not one of the known losses.
                It used to be accepted silently and failed later with an
                AttributeError during training.
        """
        super().__init__()
        available_losses = {
            "loss_fn": loss_fn,
            "alternative_loss_fn": alternative_loss_fn,
            "probabilistic_loss_fn": probabilistic_loss_fn,
        }
        if loss_fn_name not in available_losses:
            raise ValueError(
                f"Unknown loss function {loss_fn_name!r}; expected one of "
                f"{sorted(available_losses)}"
            )
        self.layer = layer
        self.optimizer = getattr(torch.optim, optimizer_name)(
            layer.parameters(), **optimizer_kwargs
        )
        self.loss_fn = available_losses[loss_fn_name]

    def forward(self, x):
        return self.layer(x)

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Run one optimization step on a batch of signed samples.

        Returns:
            ``(activations, [positive_logit, negative_logit])``. The
            activations are detached; rows whose sign is neither +1 nor -1
            are left at zero.
        """
        positive = signs == 1
        negative = signs == -1

        # The input is detached so gradients stay local to this layer.
        y = self(input_tensor.detach())
        y_pos = y[positive]
        y_neg = y[negative]
        loss_pos, cumulated_logits_pos = self.loss_fn(y_pos, theta, sign=1)
        loss_neg, cumulated_logits_neg = self.loss_fn(y_neg, theta, sign=-1)

        self.optimizer.zero_grad()
        loss = loss_pos + loss_neg
        loss.backward()
        self.optimizer.step()

        separation = [cumulated_logits_pos, cumulated_logits_neg]
        y = torch.zeros(
            input_tensor.shape[0], *y_pos.shape[1:], device=input_tensor.device
        )
        y[positive] = y_pos
        y[negative] = y_neg
        return y.detach(), separation

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the layer with the given input and theta."""
        y = self(input_tensor)
        return y, torch.square(y).mean(dim=1) - theta


class FFNormalization(BaseFFLayer):
    """Parameter-free layer dividing every sample by its L2 norm."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        l2_norm = (
            torch.norm(x.reshape(x.shape[0], -1), p=2, dim=1, keepdim=True)
            + 1e-8
        )
        return x / l2_norm

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Normalize the input; there is nothing to train.

        Returns:
            ``(normalized_input, None)``.
        """
        with torch.no_grad():
            # The input used to be omitted here, which raised a TypeError.
            output = self(input_tensor)
        return output, None

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Return the normalized input and a zero goodness per sample."""
        output = self(input_tensor)
        return output, torch.zeros(
            input_tensor.shape[0], device=input_tensor.device
        )

    @property
    def requires_training(self):
        return False


class LinearReLU(torch.nn.Module):
    """Linear layer with bias followed by a ReLU."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features, bias=True)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.linear(x))


class FCNetFFProgressive(BaseFFLayer):
    """FCNet trained using forward-forward algorithm.

    The network is trained in a progressive manner, i.e. the first layer is
    trained, then the second layer, and so on.
    """

    def __init__(
        self,
        layer_sizes: list,
        optimizer_name: str,
        optimizer_kwargs: dict,
        epochs: int,
        loss_fn_name: str = "loss_fn",
    ):
        """Stack a normalization and an ``FFLayer`` per pair of sizes.

        Args:
            layer_sizes: Input size followed by the size of every layer.
            optimizer_name: Name of a class in ``torch.optim``.
            optimizer_kwargs: Keyword arguments for every optimizer.
            epochs: Epochs spent on each layer by ``progressive_train``.
            loss_fn_name: Name of the goodness loss used by every layer.
        """
        super().__init__()
        self.epochs = epochs
        self.layers = torch.nn.ModuleList()
        for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:]):
            self.layers.append(FFNormalization())
            self.layers.append(
                FFLayer(
                    LinearReLU(in_size, out_size),
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
            )

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def progressive_train(self, dl: torch.utils.data.DataLoader, theta: float):
        """Train the network in a progressive manner."""
        print("Training the network in a progressive manner.")
        for i, layer in enumerate(self.layers):
            if layer.requires_training:
                for epoch in range(self.epochs):
                    accumulated_separation = None
                    for j, (data, signs) in enumerate(dl):
                        data = data.to(self.device)
                        signs = signs.to(self.device)
                        _, separation = layer.ff_train(data, signs, theta)
                        if accumulated_separation is None:
                            accumulated_separation = separation
                        else:
                            accumulated_separation[0] += separation[0]
                            accumulated_separation[1] += separation[1]
                        if j % 100 == 0:
                            print(f"Epoch: {epoch}, Batch: {j}, Layer: {i}")
                    print(f"Epoch {epoch} of layer {i} done.")
                    # NOTE(review): the accumulated values are per-batch
                    # means, yet they are divided by len(dl.dataset) --
                    # confirm this is the intended normalization.
                    accumulated_separation[0] /= len(dl.dataset)
                    accumulated_separation[1] /= len(dl.dataset)
                    separation_ratio = (
                        accumulated_separation[0] - accumulated_separation[1]
                    ) / abs(max(accumulated_separation))
                    print("Goodness: ", accumulated_separation)
                    print(f"Accumulated separation: {separation_ratio}")
                print(f"Finished training layer {i} / {len(self.layers)}.")
            # create a new dataloader for the next layer; outputs are
            # detached so the stored activations do not keep autograd
            # graphs alive (the next layer detaches its input anyway).
            dataset = ProgressiveTrainingDataset(
                (
                    (layer(x.to(self.device)).detach(), sign.to(self.device))
                    for x, sign in dl
                )
            )
            batch_size = dl.batch_size
            dl = torch.utils.data.DataLoader(
                dataset, batch_size=batch_size, shuffle=False
            )
        print("Finished training the network.")

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Train every layer once on the given batch.

        Returns:
            ``(output, [positive_logit_sum, negative_logit_sum])`` where
            the sums run over the trainable layers; the list is None when
            no layer reported a separation.
        """
        accumulated_separation = None
        for layer in self.layers:
            input_tensor, separation = layer.ff_train(
                input_tensor, signs, theta
            )
            if separation is None:
                # Normalization layers have nothing to report; adding their
                # None result used to raise a TypeError.
                continue
            if accumulated_separation is None:
                accumulated_separation = list(separation)
            else:
                accumulated_separation[0] += separation[0]
                accumulated_separation[1] += separation[1]
        return input_tensor, accumulated_separation

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the network with the given input and theta."""
        accumulated_goodness = torch.zeros(
            input_tensor.shape[0], device=input_tensor.device
        )
        for i, layer in enumerate(self.layers):
            input_tensor, goodness = layer.positive_eval(input_tensor, theta)
            # The first normalization/FFLayer pair (indices 0 and 1) does
            # not contribute to the accumulated goodness.
            if i > 1:
                accumulated_goodness += goodness
        return input_tensor, accumulated_goodness

    @property
    def device(self):
        return next(self.parameters()).device


class NormLinearReLU(torch.nn.Module):
    """L2 normalization followed by a ``LinearReLU``."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.norm = FFNormalization()
        self.linear_relu = LinearReLU(in_features, out_features)

    def forward(self, x):
        return self.linear_relu(self.norm(x))
def _resolve_loss_fn(loss_fn_name: str):
    """Map a loss name to the module-level loss function it denotes.

    Replaces ``eval(loss_fn_name)`` so that a configuration string can
    only select one of the known losses.

    Raises:
        ValueError: If ``loss_fn_name`` is not a known loss name.
    """
    available_losses = {
        "loss_fn": loss_fn,
        "alternative_loss_fn": alternative_loss_fn,
        "probabilistic_loss_fn": probabilistic_loss_fn,
    }
    try:
        return available_losses[loss_fn_name]
    except KeyError:
        raise ValueError(
            f"Unknown loss function {loss_fn_name!r}; expected one of "
            f"{sorted(available_losses)}"
        ) from None


class RecurrentFFLayer(BaseFFLayer):
    """Hidden layer of the recurrent net.

    The new state is computed from the concatenated previous-layer and
    next-layer states and blended with the layer's own previous state.
    """

    def __init__(
        self,
        hidden_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        self.layer = NormLinearReLU(2 * hidden_size, hidden_size)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.layer.parameters(), **optimizer_kwargs
        )
        self.loss_fn = _resolve_loss_fn(loss_fn_name)

    def forward(self, x_prev, x_same, x_next):
        x = torch.cat((x_prev, x_next), dim=1)
        new_x = self.layer(x)
        # Blend: 30% of the previous state of this layer, 70% new state.
        new_x = 0.3 * x_same + 0.7 * new_x
        return new_x

    def ff_train(
        self,
        x_prev: torch.Tensor,
        x_same: torch.Tensor,
        x_next: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimization step.

        Returns:
            ``(new_state, [positive_goodness, negative_goodness])``.
        """
        # Inputs are detached so gradients stay local to this layer.
        new_x = self(x_prev.detach(), x_same.detach(), x_next.detach())
        y_pos = new_x[signs == 1]
        y_neg = new_x[signs == -1]
        loss_pos, goodness_pos = self.loss_fn(y_pos, theta, 1)
        loss_neg, goodness_neg = self.loss_fn(y_neg, theta, -1)
        loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return new_x, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(
        self,
        x_prev: torch.Tensor,
        x_same: torch.Tensor,
        x_next: torch.Tensor,
        theta: float,
    ):
        """Return the new state and its per-sample goodness."""
        new_x = self(x_prev, x_same, x_next)
        goodness = new_x.pow(2).mean(dim=1) - theta
        return new_x, goodness


class RecurrentProjectionFFLayer(BaseFFLayer):
    """Normalized linear + ReLU projection trained with forward-forward."""

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        self.layer = NormLinearReLU(input_size, output_size)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.layer.parameters(), **optimizer_kwargs
        )
        self.loss_fn = _resolve_loss_fn(loss_fn_name)

    def forward(self, x: torch.Tensor):
        return self.layer(x)

    def ff_train(
        self,
        x: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimization step.

        Returns:
            ``(projection, [positive_goodness, negative_goodness])``.
        """
        new_x = self(x.detach())
        y_pos = new_x[signs == 1]
        y_neg = new_x[signs == -1]
        loss_pos, goodness_pos = self.loss_fn(y_pos, theta, 1)
        loss_neg, goodness_neg = self.loss_fn(y_neg, theta, -1)
        loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return new_x, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor, theta: float):
        """Return the projection and its per-sample goodness."""
        new_x = self(x)
        goodness = new_x.pow(2).mean(dim=1) - theta
        return new_x, goodness


class RecurrentProjectedSoftmaxFFLayer(BaseFFLayer):
    """Normalization, linear layer and softmax over dimension 1."""

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        self.loss_fn = _resolve_loss_fn(loss_fn_name)
        self.norm = FFNormalization()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.softmax = torch.nn.Softmax(dim=1)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.linear.parameters(), **optimizer_kwargs
        )

    def forward(self, x: torch.Tensor):
        x = self.norm(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x

    def ff_train(
        self,
        x: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimization step.

        Returns:
            ``(probabilities, [positive_goodness, negative_goodness])``.
        """
        new_x = self(x.detach())
        y_pos = new_x[signs == 1]
        y_neg = new_x[signs == -1]
        loss_pos, goodness_pos = self.loss_fn(y_pos, theta, 1)
        loss_neg, goodness_neg = self.loss_fn(y_neg, theta, -1)
        loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return new_x, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor, theta: float):
        """Return the probabilities and their per-sample goodness."""
        new_x = self(x)
        goodness = new_x.pow(2).mean(dim=1) - theta
        return new_x, goodness
class RecurrentFCNetFF(BaseFFLayer):
    """Recurrent FCNet trained using forward-forward algorithm.

    The state of the network is a list
    ``[projection, hidden_1, ..., hidden_L, labels]``. At every time step
    each hidden layer is updated from the states of the layer below, of
    itself and of the layer above taken at the previous time step.
    """

    def __init__(
        self,
        layer_sizes: list,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        """Build the projector, the recurrent layers and the label heads.

        Args:
            layer_sizes: ``[input_size, hidden..., n_labels]``.
            optimizer_name: Name of a class in ``torch.optim``.
            optimizer_kwargs: Keyword arguments for every optimizer.
            loss_fn_name: Name of the goodness loss used by the layers.
        """
        super().__init__()
        self.time_steps = 8
        self.test_time_steps = 8
        # Time steps whose goodness is accumulated by positive_eval.
        self.storable_time_steps = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        self.states = []
        self.layers = torch.nn.ModuleList()
        self.projector = RecurrentProjectionFFLayer(
            layer_sizes[0],
            layer_sizes[1],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        for i in range(1, len(layer_sizes) - 1):
            self.layers.append(
                RecurrentFFLayer(
                    layer_sizes[i],
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
            )
        # Projects the label vector up to the size of the last hidden layer.
        self.proj_y = RecurrentProjectionFFLayer(
            layer_sizes[-1],
            layer_sizes[-2],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        self.softmax = RecurrentProjectedSoftmaxFFLayer(
            layer_sizes[-2],
            layer_sizes[-1],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        self.num_labels = layer_sizes[-1]

    @property
    def device(self):
        return next(self.parameters()).device

    @torch.no_grad()
    def bottom_up(self, x: torch.Tensor, y: torch.Tensor):
        """Initialize the states with a single bottom-up pass.

        Args:
            x: Flattened inputs.
            y: One-hot labels attached as the last state.

        Returns:
            ``(states, neg_samples)`` where ``neg_samples`` are one-hot
            labels sampled from the softmax head.
        """
        states = []
        x_proj = self.projector(x)
        for layer in self.layers:
            states.append(x_proj)
            x_proj = layer(
                x_proj,
                torch.zeros_like(x_proj, device=self.device),
                torch.zeros_like(x_proj, device=self.device),
            )
        states.append(x_proj)
        states.append(y)
        y_arg = torch.argmax(y, dim=1)
        x_proj_ = x_proj.clone()
        # NOTE(review): this masks the hidden unit whose index equals the
        # true label before the softmax head is applied; it does not remove
        # the true label's probability, so the sampled "negative" label may
        # equal the true one -- confirm intent.
        x_proj_[torch.arange(x_proj.shape[0]), y_arg] = -1e6
        neg_prob = self.softmax(x_proj_)
        # Inverse-CDF sampling: first class whose cumulative probability
        # exceeds a uniform draw.
        cumulative_neg_prob = torch.cumsum(neg_prob, dim=1)
        neg_samples = torch.argmax(
            1.0
            * (
                cumulative_neg_prob
                > torch.rand(x.shape[0], 1).to(self.device)
            ),
            dim=1,
        )
        # torch.nn.functional is the public home of one_hot; the previous
        # torch.functional.F path relied on an incidental module alias.
        neg_samples = torch.nn.functional.one_hot(
            neg_samples, num_classes=self.num_labels
        )
        return states, neg_samples

    def forward(self, x: torch.Tensor, prev_states: List[torch.Tensor]):
        """Advance the states by one time step.

        Returns:
            The new list of states; its last entry is the softmax output.
        """
        x_proj = self.projector(x)
        new_states = []
        for i, layer in enumerate(self.layers):
            if i < len(self.layers) - 1:
                next_state = prev_states[i + 2]
            else:
                # The state above the last layer is the label vector.
                next_state = self.proj_y(prev_states[i + 2].float())
            new_states.append(x_proj)
            x_proj = layer(prev_states[i], prev_states[i + 1], next_state)
        new_states.append(x_proj)
        y = self.softmax(x_proj)
        new_states.append(y)
        return new_states

    def ff_train(
        self, input_tensor: torch.Tensor, labels: torch.Tensor, theta: float
    ):
        """Train the network with the given target.

        Positive states are built from the true labels and negative states
        from sampled labels; both halves are concatenated along the batch
        dimension and trained for ``self.time_steps`` steps.

        Returns:
            ``(states, [positive_goodness, negative_goodness])`` with the
            states restricted to the positive half of the batch.
        """
        with torch.no_grad():
            states, neg_samples = self.bottom_up(input_tensor, labels)
            neg_states, _ = self.bottom_up(input_tensor, neg_samples)
            states = [
                torch.cat([s, ns], dim=0) for s, ns in zip(states, neg_states)
            ]
            signs = torch.cat(
                [
                    torch.ones(input_tensor.shape[0], device=self.device),
                    -torch.ones(input_tensor.shape[0], device=self.device),
                ],
                dim=0,
            )
            input_tensor = torch.cat([input_tensor, input_tensor], dim=0)
        # states have been created, now we can train the network
        x_proj, accumulated_goodness = self.projector.ff_train(
            input_tensor, signs, theta
        )
        for _ in range(self.time_steps):
            new_states = []
            x = x_proj
            for j, layer in enumerate(self.layers):
                if j < len(self.layers) - 1:
                    next_state = states[j + 2]
                else:
                    next_state = self.proj_y(states[j + 2].float())
                new_states.append(x)
                x, goodnesses = layer.ff_train(
                    states[j], states[j + 1], next_state, signs, theta
                )
                accumulated_goodness[0] += goodnesses[0]
                accumulated_goodness[1] += goodnesses[1]
            new_states.append(x)
            with torch.no_grad():
                # Re-sample the negative labels from the current states.
                x_ = states[-2][torch.where(signs == -1)]
                real_y = states[-1][torch.where(signs == 1)]
                x_[
                    torch.arange(x_.shape[0]), torch.argmax(real_y, dim=1)
                ] = -1e6
                y = self.softmax(x_)
                cumulative_y = torch.cumsum(y, dim=1)
                neg_samples = torch.argmax(
                    1.0
                    * (
                        cumulative_y
                        > torch.rand(x_.shape[0], 1).to(self.device)
                    ),
                    dim=1,
                )
                neg_samples = torch.nn.functional.one_hot(
                    neg_samples, num_classes=self.num_labels
                )
                # replace just negative samples
                next_labels = states[-1].clone()
                next_labels[torch.where(signs == -1)] = neg_samples
            new_states.append(next_labels)
            states = new_states
        # Average over the projector step plus every layer/time-step update.
        accumulated_goodness[0] /= self.time_steps * len(self.layers) + 1
        accumulated_goodness[1] /= self.time_steps * len(self.layers) + 1
        with torch.no_grad():
            states = [t[: input_tensor.shape[0] // 2] for t in states]
        return states, accumulated_goodness

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the network with the given input and theta.

        Every input is paired with every candidate label; the label with
        the highest accumulated goodness is the prediction.

        Returns:
            ``(prediction, accumulated_goodness)`` where the goodness has
            shape ``(batch, num_labels)``.
        """
        labels = torch.arange(0, self.num_labels, device=self.device)
        labels = torch.nn.functional.one_hot(
            labels, num_classes=self.num_labels
        )
        original_bs = input_tensor.shape[0]
        input_tensor = (
            input_tensor.unsqueeze(1)
            .repeat(1, self.num_labels, 1)
            .reshape(-1, input_tensor.shape[-1])
        )
        labels = (
            labels.unsqueeze(0)
            .repeat(original_bs, 1, 1)
            .reshape(-1, labels.shape[-1])
        )
        states, _ = self.bottom_up(input_tensor, labels)
        x_proj, goodness = self.projector.positive_eval(input_tensor, theta)
        accumulated_goodness = goodness
        for time_step in range(self.test_time_steps):
            new_states = []
            x = x_proj
            for j, layer in enumerate(self.layers):
                if j < len(self.layers) - 1:
                    next_state = states[j + 2]
                else:
                    next_state = self.proj_y(states[j + 2].float())
                new_states.append(x)
                x, goodnesses = layer.positive_eval(
                    states[j], states[j + 1], next_state, theta
                )
                if time_step in self.storable_time_steps:
                    accumulated_goodness += goodnesses
            new_states.append(x)
            if time_step in self.storable_time_steps:
                _, goodness = self.softmax.positive_eval(x, theta)
                accumulated_goodness += goodness
            # The candidate labels stay fixed during evaluation.
            new_states.append(states[-1])
            states = new_states
        accumulated_goodness = accumulated_goodness.reshape(
            original_bs, self.num_labels
        )
        prediction = torch.argmax(accumulated_goodness, dim=1)
        return prediction, accumulated_goodness
class LMFFLinearSoftmax(BaseFFLayer):
    """Normalisation + linear + softmax head trained with a supervised loss.

    ``forward`` returns class probabilities (softmax over ``dim=1``).
    ``ff_train`` performs one optimiser step on the samples whose sign is
    ``1`` and returns probabilities for every sample in the batch.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
    ):
        """Build the layer and its optimiser.

        Args:
            input_size: Number of input features of the linear layer.
            output_size: Number of output classes.
            optimizer_name: Name of a class in ``torch.optim``.
            optimizer_kwargs: Keyword arguments forwarded to the optimiser.
        """
        super().__init__()
        self.loss_fn = torch.nn.NLLLoss()
        self.norm = FFNormalization()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.softmax = torch.nn.Softmax(dim=1)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.parameters(), **optimizer_kwargs
        )

    def _logits(self, x: torch.Tensor) -> torch.Tensor:
        """Return the pre-softmax scores for ``x``."""
        return self.linear(self.norm(x))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the softmax probabilities for ``x``."""
        return self.softmax(self._logits(x))

    def ff_train(
        self,
        input_tensor: torch.Tensor,
        labels: torch.Tensor,
        signs: torch.Tensor,
    ):
        """Run one optimisation step on the positive samples.

        Args:
            input_tensor: Batch of inputs (positive and negative samples).
            labels: Per-sample label scores; the target class is the
                ``argmax`` over ``dim=1``.
            signs: 1-D tensor with ``1`` for positive samples and ``-1``
                for negative samples.

        Returns:
            A tuple ``(probabilities, loss)``. ``probabilities`` has one row
            per input sample: rows of positive samples hold the
            probabilities computed before the optimiser step, rows of
            negative samples the ones computed after it. ``loss`` is the
            Python float value of the loss on the positive samples.
        """
        positive = signs == 1
        negative = signs == -1

        logits = self._logits(input_tensor[positive])
        targets = torch.argmax(labels[positive], dim=1)
        # NLLLoss expects log-probabilities; feeding raw softmax outputs
        # would optimise (and report) -p instead of -log(p).
        loss = self.loss_fn(torch.log_softmax(logits, dim=1), targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        with torch.no_grad():
            # Positive rows keep the pre-update probabilities.
            pos_probs = self.softmax(logits)
            neg_probs = self(input_tensor[negative])
            new_x = torch.zeros(
                len(input_tensor),
                *pos_probs.shape[1:],
                device=input_tensor.device,
            )
            new_x[positive] = pos_probs
            new_x[negative] = neg_probs
        return new_x, loss.item()

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor) -> torch.Tensor:
        """Return the predicted probabilities for ``x`` without gradients."""
        return self(x)
def forward(self, input_tensor: torch.Tensor):
    """Run the language model on ``input_tensor``.

    The input goes through ``self.token2emb`` and then through every layer
    of ``self.layers`` in sequence. The output of each layer is kept, all
    of them are concatenated along ``dim=1`` and the result is passed to
    ``self.emb2token``.

    Args:
        input_tensor: Tensor accepted by ``self.token2emb``.

    Returns:
        Whatever ``self.emb2token`` returns for the concatenated features.
    """
    hidden = self.token2emb(input_tensor)
    per_layer = []
    for block in self.layers:
        hidden = block(hidden)
        per_layer.append(hidden)
    stacked = torch.cat(per_layer, dim=1)
    return self.emb2token(stacked)
# ``Generator`` lives in ``collections.abc``; the alias in ``collections``
# was removed in Python 3.10 and importing it from there raises ImportError.
from collections.abc import Generator

import torch.utils.data


class ProgressiveTrainingDataset(torch.utils.data.Dataset):
    """Dataset for progressive training.

    The generator is consumed eagerly at construction time: every
    ``(data, sign)`` pair it yields is unrolled element by element, and each
    resulting ``(data_item, sign_item)`` tuple becomes one dataset entry.
    """

    def __init__(self, dataset_generator: Generator):
        """Materialise ``dataset_generator`` into an in-memory list.

        Args:
            dataset_generator: Iterable yielding ``(data, sign)`` pairs whose
                two members can be iterated in lockstep.
        """
        with torch.no_grad():
            self.internal_dataset = [
                batch
                for data, sign in dataset_generator
                for batch in zip(data, sign)
            ]

    def __getitem__(self, index):
        """Return the ``(data_item, sign_item)`` tuple stored at ``index``."""
        return self.internal_dataset[index]

    def __len__(self):
        """Return the number of stored tuples."""
        return len(self.internal_dataset)


def compute_perplexity(tensor: torch.Tensor):
    """Compute perplexity of a tensor.

    The tensor has shape (batch_size, sequence_length, vocab_size).
    The softmax has already been computed over the vocab dimension.

    Args:
        tensor: Probabilities; the entropy is taken over the last dimension.

    Returns:
        A scalar tensor: the mean over all positions of ``exp(entropy)``.
    """
    # xlogy(p, p) is defined as 0 where p == 0, whereas p * log(p) would
    # evaluate 0 * -inf and turn the whole result into NaN.
    entropy = -torch.sum(torch.xlogy(tensor, tensor), dim=-1)
    return torch.exp(entropy).mean()
In the search step, the App automatically tests multiple LMs-specific optimization techniques across the software-to-hardware stack, such as SmoothQuant quantization, FlashAttention, and inference-specific kernels. The App also tunes the optimal parallelization strategy and its configuration parameters, allowing it to find the optimal configuration of techniques for accelerating the model. Finally, in the serve step, the App returns an accelerated version of the user's model in the DL framework of choice, providing a significant boost in performance. Overall, LargeSpeedster is an easy-to-use tool that allows users to optimize their large AI models and get the most out of their software-to-hardware stack. Try it out today, and reach out if you have any feedback! ================================================ FILE: optimization/nebullvm/.pre-commit-config.yaml ================================================ repos: - repo: https://github.com/ambv/black rev: 22.3.0 hooks: - id: black args: [--line-length=79] - repo: https://github.com/pycqa/flake8 rev: 3.9.2 hooks: - id: flake8 args: [--exclude=nebullvm/tools/diffusers.py] ================================================ FILE: optimization/nebullvm/CONTRIBUTING.md ================================================ # Guidelines for Contributing to Nebullvm 🚀 Hello coder 👋 We are very happy that you have decided to contribute to the library and we thank you for your efforts. Here you can find guidelines on how to standardize your code with the style we adopted for `nebullvm`. But remember, there are various ways to help the community other than submitting code contributions, answering questions and improving the documentation are also very valuable. 
It also helps us if you mention our library in your blog posts to show off the cool things it's made possible, or just give the repository a ⭐️ to show us that you appreciate the project. This guide was inspired by the awesome [Transformers](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) guide to contributing. We hope to come across your pull request soon! Happy coding 💫 The nebullvm Team ## How to submit an issue Did you spot a bug? Did you come up with a cool idea that you think should be implemented in nebullvm? Well, GitHub issues are the best way to let us know! We don't have a strict policy on issue generation, just use a meaningful title and specify the problem or your proposal in the first comment. Then, you can use GitHub labels to let us know what kind of proposal you are making, for example `bug` if you are reporting a bug or `enhancement` if you are proposing a library improvement. ## How to contribute to solve an issue We are always delighted to welcome other people to the contributors section of nebullvm! We are looking forward to welcoming you to the community, here are some guidelines to follow: 1. Please [fork](https://github.com/nebuly-ai/nebullvm/fork) the [library](https://github.com/nebuly-ai/nebullvm) by clicking on the Fork button on the repository's page. This will create a copy of the repository in your GitHub account. 2. Clone your fork to your local machine, and add the base repository as a remote: ```bash $ git clone git@github.com:<your-github-handle>/nebullvm.git $ cd nebullvm $ git remote add upstream https://github.com/nebuly-ai/nebullvm.git ``` 3. Install the library in editable mode with the following command: ```bash $ pip install -e . ``` 4. Work on your fork to develop the feature you have in mind. 5. Nebullvm relies on `black` to format its source code consistently.
To use the formatting style defined for nebullvm, run the following commands: ```bash $ pip install pre-commit black autoflake $ pre-commit install # the following command is optional, but needed if you have already # committed some files to your forked repo. $ pre-commit run --all-files ``` As for the naming convention, we follow [PEP 8](https://peps.python.org/pep-0008/) for code and a slight variation of [Google convention](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for docstrings. For docstrings we redundantly express the input type in both the function definition and the function docstring. 6. Once you're happy with your changes, add changed files with git add and commit your code: ```bash $ git add edited_file.py $ git commit -m "Add a cool feature" ``` 7. Push your changes to your repo: ```bash $ git push ``` 8. Now you can go to the repo you have forked on your github profile and press on **Pull Request** to open a pull request. In the pull request specify which problems it is solving. For instance, if the pull request solves `Issue #1`, the comment should be `Closes #1`. Also make the title of the pull request meaningful and self-explanatory. 
--- See you soon in the list of nebullvm contributors 🌈 ================================================ FILE: optimization/nebullvm/Dockerfile ================================================ ARG STARTING_IMAGE=nvcr.io/nvidia/tensorrt:23.03-py3 FROM ${STARTING_IMAGE} WORKDIR / # Set frontend as non-interactive ARG DEBIAN_FRONTEND=noninteractive RUN apt-get -y update && apt-get -y upgrade RUN apt-get install ffmpeg libsm6 libxext6 -y # Install other libraries RUN apt-get install -y sudo wget # Install libraries RUN python3 -m pip install --upgrade pip \ && pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118 \ && pip install --no-cache-dir tensorflow \ && pip install --no-cache-dir xformers \ && pip install --no-cache-dir accelerate \ && python3 -m pip install --no-cache-dir --upgrade tensorrt # Copy the working dir to the container COPY ../.. /nebullvm # Install nebullvm ARG NEBULLVM_VERSION=latest RUN if [ "$NEBULLVM_VERSION" = "latest" ] ; then \ cd nebullvm ; \ pip install . ; \ cd apps/accelerate/speedster ; \ pip install . ; \ cd ../../../.. 
; \ rm -rf nebullvm ; \ else \ pip install --no-cache-dir nebullvm==${NEBULLVM_VERSION} ; \ fi # Install required python modules RUN pip install --no-cache-dir cmake # Install default deep learning compilers ARG COMPILER=all RUN if [ "$COMPILER" = "all" ] ; then \ python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers all ; \ elif [ "$COMPILER" = "tensorrt" ] ; then \ python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers tensorrt ; \ elif [ "$COMPILER" = "openvino" ] ; then \ python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers openvino ; \ elif [ "$COMPILER" = "onnxruntime" ] ; then \ python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers onnxruntime ; \ fi # Install TVM RUN if [ "$COMPILER" = "all" ] || [ "$COMPILER" = "tvm" ] ; then \ pip install --no-cache-dir https://github.com/tlc-pack/tlcpack/releases/download/v0.11.1/tlcpack_cu116-0.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ; \ pip install --no-cache-dir xgboost ; \ python3 -c "from tvm.runtime import Module" ; \ fi ENV SIGOPT_PROJECT="tmp" ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/tensorrt ENV CUDA_MODULE_LOADING="LAZY" ================================================ FILE: optimization/nebullvm/LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: optimization/nebullvm/MANIFEST.in ================================================ recursive-include nebullvm/installers/tvm_installers *.cmake recursive-include nebullvm/installers *.sh ================================================ FILE: optimization/nebullvm/README.md ================================================







A framework for building optimization modules to boost the performance of your AI systems

--- **Documentation**: docs.nebuly.com/ --- `Nebullvm` is a framework for building the optimization modules needed to optimize the performance of your AI systems. The optimization modules are stack-agnostic and work with any library. They are designed to be easily integrated into your system, providing a quick and seamless boost to its performance. Simply plug and play to start realizing the benefits of optimized performance right away. If you like the idea, give us a star to show your support for the project ⭐ ## **What can this help with?** We currently provide several modules built on top of the framework: ✅ [Speedster](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster): Automatically apply the best set of SOTA optimization techniques to achieve the maximum inference speed-up on your hardware. ✅ [OpenAlphaTensor](https://github.com/nebuly-ai/nebuly/tree/main/optimization/open_alpha_tensor): Increase the computational performance of an AI model with custom-generated matrix multiplication algorithms fine-tuned for your specific hardware. ✅ [Forward-Forward](https://github.com/nebuly-ai/nebuly/tree/main/optimization/forward_forward): The Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes. ## Next modules and roadmap We are actively working on incorporating the following modules, as requested by members of our community, in upcoming releases: - [ ] [CloudSurfer](https://github.com/nebuly-ai/nebuly/blob/main/optimization/cloud_surfer): Automatically discover the optimal cloud configuration and hardware on AWS, GCP and Azure to run your AI models. - [ ] [OptiMate](https://github.com/nebuly-ai/nebuly/blob/main/optimization/optimate): Interactive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup.
## Contributing As an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [linked](https://docs.nebuly.com/contributions) page for more information on how to get involved. ---

Join the community | Contribute to the library

================================================ FILE: optimization/nebullvm/azure-pipelines.yml ================================================ trigger: branches: include: - main paths: exclude: - .github/* - docs/** - README.md - notebooks/* pool: name: gpu-t4-pool variables: imageName: 'nebulydocker/nebullvm' steps: - script: | nvidia-smi displayName: 'Ensure cuda is installed correctly' - script: | pip uninstall -y nebullvm pip install . displayName: 'Install nebullvm' - script: | cd apps/accelerate/speedster pip uninstall -y speedster pip install . cd ../../.. displayName: 'Install speedster' - script: python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117 displayName: 'Install PyTorch' - script: | export PATH=$PATH:/home/AzDevOps/.local/bin python -m nebullvm.installers.auto_installer --compilers all displayName: 'Install deep learning compilers' - script: | python -m pip install -r "requirements-dev.txt" pip install pytest-azurepipelines displayName: 'Install requirements for testing' - script: | res=$(python -c "from nebullvm.tools.utils import check_device; print(check_device().type.name == 'GPU')") if [ "$res" = "False" ]; then echo "GPU is not available" exit 1 fi echo "GPU is available: $res" res=$(python -c "import torch; print(torch.cuda.is_available())") if [ "$res" = "False" ]; then echo "CUDA is not available for PyTorch" exit 1 fi echo "CUDA is available for PyTorch: $res" res=$(python -c "import torch; num_devices = torch.cuda.device_count(); print(num_devices is not None and isinstance(num_devices, int) and num_devices > 0)") if [ "$res" = "False" ]; then echo "No CUDA devices found" exit 1 fi echo "CUDA devices found: $res" displayName: 'Check GPU is available' - script: | export SPEEDSTER_DISABLE_TELEMETRY=1 export PATH=$PATH:/home/AzDevOps/.local/bin cd apps/accelerate/speedster pytest cd ../../.. 
displayName: 'Run api tests' failOnStderr: true - script: | export PATH=$PATH:/home/AzDevOps/.local/bin cd nebullvm pytest cd ../ displayName: 'Run components tests' failOnStderr: true ================================================ FILE: optimization/nebullvm/docker_build.sh ================================================ # Create image with all compilers installed docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-allcompilers . # Create an image for each compiler installed docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-onnxruntime . --build-arg COMPILER="onnxruntime" docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-openvino . --build-arg COMPILER="openvino" docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-tvm . --build-arg COMPILER="tvm" docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-tensorrt . --build-arg COMPILER="tensorrt" ================================================ FILE: optimization/nebullvm/docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS = SPHINXBUILD = sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: optimization/nebullvm/docs/README.md ================================================ # Documentation Nebullvm documentation is built using Sphinx and furo! You can follow the guide below to build it. ## Build the docs: 1.
Install nebullvm according to [README.md](../../../README.md#step-1-installation-of-nebullvm-library). 2. Install additional libraries required to build docs: ``` pip install -r requirements-docs.txt ``` 3. Run `make html` from this directory. ================================================ FILE: optimization/nebullvm/docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # flake8: noqa import os import sys sys.path.insert(0, os.path.abspath("../../../")) # import sphinx_rtd_theme # -- Project information ----------------------------------------------------- project = "nebullvm" copyright = "2022, nebuly" author = "nebuly" # The full version, including alpha/beta/rc tags # release = "0.3.0" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ "sphinx.ext.napoleon", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.todo", "sphinx.ext.coverage", "sphinx.ext.mathjax", "sphinx.ext.viewcode", "sphinx.ext.githubpages", ] # -- Configurations for plugins ------------ napoleon_google_docstring = True napoleon_include_init_with_doc = True napoleon_include_special_with_doc = True napoleon_numpy_docstring = False napoleon_use_rtype = False autodoc_inherit_docstrings = False autodoc_member_order = "bysource" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # # html_theme = "sphinx_rtd_theme" html_theme = "furo" html_theme_options = { "light_css_variables": { "color-brand-primary": "#dark", "color-brand-content": "#dark", "color-admonition-background": "#dark", "font-stack": "Montserrat, sans-serif", "font-stack--monospace": "Courier, monospace", }, "footer_icons": [ { "name": "GitHub", "url": "https://github.com/nebuly-ai/nebullvm", "html": """ """, "class": "", }, ], "light_logo": "Logo_azure.svg", "dark_logo": "Logo_azure.svg", } html_static_path = ["_static"] html_title = "" # html_theme_options = { # "announcement": "Important announcement!", # } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] ================================================ FILE: optimization/nebullvm/docs/index.rst ================================================ Welcome to nebullvm's documentation! ====================================== .. 
toctree:: :maxdepth: 2 modules/index ================================================ FILE: optimization/nebullvm/docs/modules/api.rst ================================================ nebullvm.api ============= .. automodule:: nebullvm :members: .. automodule:: nebullvm.api.frontend.huggingface :members: ================================================ FILE: optimization/nebullvm/docs/modules/converters.rst ================================================ nebullvm.converters =================== .. automodule:: nebullvm.converters :members: ================================================ FILE: optimization/nebullvm/docs/modules/index.rst ================================================ API Documentation ================== .. toctree:: api converters inference_learners installers optimizers ================================================ FILE: optimization/nebullvm/docs/modules/inference_learners.rst ================================================ nebullvm.inference_learners =========================== .. automodule:: nebullvm.inference_learners :members: ================================================ FILE: optimization/nebullvm/docs/modules/installers.rst ================================================ nebullvm.installers =================== .. automodule:: nebullvm.installers :members: ================================================ FILE: optimization/nebullvm/docs/modules/optimizers.rst ================================================ nebullvm.optimizers =================== .. 
class App(abc.ABC):
    """Abstract base for applications exposing a single ``execute`` entry point.

    The class cannot be instantiated directly: concrete subclasses have to
    override :meth:`execute`.
    """

    def __init__(self):
        # Delegate to the next initializer in the MRO.
        super(App, self).__init__()

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the application with the given keyword options.

        Raises:
            NotImplementedError: Always, when reached through ``super()``.
        """
        raise NotImplementedError
TVM_FILENAMES = {"engine": "compiled_lib.so"} ONNX_FILENAMES = {"model_name": "model.onnx"} ONNX_PROVIDERS = { "cuda": [ "TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider", ], "cpu": [ "CPUExecutionProvider", ], } OPENVINO_FILENAMES = { "metadata": LEARNER_METADATA_FILENAME, "description_file": "description.xml", "weights": "weights.bin", } TENSORFLOW_BACKEND_FILENAMES = { "tflite_model": "tf_model.tflite", "tf_model": "tf_model.h5", } TORCH_TENSORRT_PRECISIONS = { "torch.float32": {torch.float}, "torch.float16": {torch.float, torch.half}, "torch.int8": {torch.float, torch.half, torch.int8}, } MIN_DIM_INPUT_DATA = 100 QUANTIZATION_DATA_NUM = 300 CONSTRAINED_METRIC_DROP_THS = 1e-2 TRAIN_TEST_SPLIT_RATIO = 0.8 COMPILER_LIST = [ "deepsparse", "tensor_rt", "torchscript", "onnxruntime", "tflite", "xla", "tvm", "openvino", "bladedisc", "intel_neural_compressor", "torch_neuron", "torch_xla", "torch_dynamo", "faster_transformer", ] COMPRESSOR_LIST = [ "sparseml", "intel_pruning", ] ONNX_MODULES = ["openvino", "tensor_rt"] TORCH_MODULES = [ "deepsparse", "intel_neural_compressor", "tensor_rt", "torch_tensor_rt", "faster_transformer", ] TENSORFLOW_MODULES = [] HUGGING_FACE_MODULES = [] DIFFUSERS_MODULES = [] LIBRARIES_GPU = ["tensor_rt", "torch_tensor_rt", "faster_transformer"] MIN_NUMBER = 1e-4 DEFAULT_METRIC_DROP_THS = 1e-2 ACTIVATION_METRIC_DROP_THS = 2e-2 ================================================ FILE: optimization/nebullvm/nebullvm/core/__init__.py ================================================ ================================================ FILE: optimization/nebullvm/nebullvm/core/models.py ================================================ import subprocess from dataclasses import dataclass from enum import Enum from functools import cached_property from typing import Optional, Any, Union, Tuple, List, Dict import numpy as np from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import 
class DataType(str, Enum):
    """Framework-agnostic tensor data types supported by nebullvm.

    Members can be converted from/to the native dtype objects of PyTorch,
    TensorFlow and NumPy through the module-level
    ``FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT`` lookup table.
    """

    FLOAT16 = "float16"
    FLOAT32 = "float32"
    INT32 = "int32"
    INT64 = "int64"

    @classmethod
    def from_framework_format(
        cls, dtype: Union[torch.dtype, tf.dtypes.DType, np.dtype]
    ):
        """Map a framework-native dtype to the matching ``DataType``.

        Args:
            dtype: A ``torch.dtype``, a ``tf.dtypes.DType``, or anything
                ``np.dtype`` can interpret (an ``np.dtype`` instance or a
                NumPy scalar type such as ``np.float32``).

        Returns:
            The corresponding ``DataType`` member.

        Raises:
            KeyError: If the dtype has no entry in the conversion table.
        """
        if isinstance(dtype, torch.dtype):
            framework = "torch"
        elif isinstance(dtype, tf.dtypes.DType):
            framework = "tensorflow"
        else:
            framework = "numpy"
            # Normalise through np.dtype so that both ``np.dtype("float32")``
            # and the scalar type ``np.float32`` resolve to the scalar type
            # used as key in the table (a bare ``dtype.type`` raises
            # AttributeError on the latter).
            dtype = np.dtype(dtype).type
        return FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[framework][dtype]

    def _to_framework_format(self, framework: str):
        """Reverse lookup of ``self`` in the table of ``framework``.

        Returns ``None`` when the table has no entry for this member.
        """
        table = FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[framework]
        return next(
            (native for native, value in table.items() if value == self),
            None,
        )

    def to_torch_format(self):
        """Return the matching ``torch.dtype`` (``None`` if not mapped)."""
        return self._to_framework_format("torch")

    def to_tf_format(self):
        """Return the matching TensorFlow dtype (``None`` if not mapped)."""
        return self._to_framework_format("tensorflow")

    def to_numpy_format(self):
        """Return the matching NumPy scalar type (``None`` if not mapped)."""
        return self._to_framework_format("numpy")
@dataclass
class OptimizeInferenceResult:
    """Outcome of the OptimizeInferenceOp.

    Bundles the original model, the hardware it was measured on and, when
    the optimization produced one, the optimized model. Every derived
    metric is ``None`` when ``optimized_model`` is ``None``.
    """

    original_model: OriginalModel
    hardware_setup: HardwareSetup
    optimized_model: Optional[OptimizedModel]

    @property
    def metric_drop(self) -> Optional[float]:
        """Metric drop reported by the optimized model, if there is one."""
        optimized = self.optimized_model
        return None if optimized is None else optimized.metric_drop

    @cached_property
    def latency_improvement_rate(self) -> Optional[float]:
        """Original latency divided by optimized latency.

        Returns -1 when the optimized latency is zero.
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        optimized_latency = optimized.latency_seconds
        if optimized_latency == 0:
            return -1
        return self.original_model.latency_seconds / optimized_latency

    @cached_property
    def throughput_improvement_rate(self) -> Optional[float]:
        """Optimized throughput divided by original throughput.

        Returns -1 when the original throughput is zero.
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        original_throughput = self.original_model.throughput
        if original_throughput == 0:
            return -1
        return optimized.throughput / original_throughput

    @cached_property
    def size_improvement_rate(self) -> Optional[float]:
        """Original size divided by optimized size.

        Returns 1 when the optimized size is zero.
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        optimized_size = optimized.size_mb
        if optimized_size == 0:
            return 1
        return self.original_model.size_mb / optimized_size
class Device:
    """A compute device, identified by its type and an index.

    Attributes:
        type: The ``DeviceType`` of the device.
        idx: Index of the device among the devices of the same type.
    """

    def __init__(self, type: DeviceType, idx: int = 0):
        self.type = type
        self.idx = idx

    @staticmethod
    def _parse_index(string: str) -> int:
        """Return the integer following the first ``:`` of ``string``, or 0."""
        return int(string.split(":")[1]) if ":" in string else 0

    @classmethod
    def from_str(cls, string: str) -> "Device":
        """Build a ``Device`` from a string such as ``"cuda:1"`` or ``"tpu"``.

        Strings starting with ``cuda`` or ``gpu`` map to a GPU and strings
        starting with ``tpu`` map to a TPU; an optional ``:<idx>`` suffix
        selects the device index. Any other string yields the CPU device.

        Raises:
            ValueError: If the text after ``:`` is not an integer.
        """
        if string.startswith(("cuda", "gpu")):
            return cls(DeviceType.GPU, cls._parse_index(string))
        if string.startswith("tpu"):
            return cls(DeviceType.TPU, cls._parse_index(string))
        return cls(DeviceType.CPU)

    def to_torch_format(self) -> str:
        """Return the device string in PyTorch notation."""
        if self.type is DeviceType.GPU:
            return f"cuda:{self.idx}"
        elif self.type is DeviceType.TPU:
            return f"xla:{self.idx}"
        return "cpu"

    def to_tf_format(self) -> str:
        """Return the device string in TensorFlow notation.

        Every non-GPU device type is reported as ``"CPU"``.
        """
        if self.type is DeviceType.GPU:
            return f"GPU:{self.idx}"
        return "CPU"

    def _query_gpu_memory_bytes(self, field: str) -> int:
        """Read ``memory.<field>`` of this GPU through nvidia-smi, in bytes.

        Args:
            field: Name of the nvidia-smi memory field ("total" or "free").

        Raises:
            Exception: If the device is not a GPU, or if nvidia-smi cannot be
                run or its output cannot be parsed (the original error is
                chained as ``__cause__``).
        """
        if self.type is not DeviceType.GPU:
            raise Exception("Device type must be GPU")
        try:
            # Argument list instead of a shell string: no shell is spawned.
            raw_output = subprocess.check_output(
                [
                    "nvidia-smi",
                    f"--query-gpu=memory.{field}",
                    "--format=csv,nounits,noheader",
                ]
            )
            # One value per GPU is printed; pick the one at this index.
            value = raw_output.decode("utf-8").split()[self.idx]
            # nvidia-smi reports memory in MiB; convert to bytes.
            return int(value) * 1024 * 1024
        except Exception as err:
            raise Exception(
                f"Unable to get {field} memory of device. "
                "Please make sure nvidia-smi is available."
            ) from err

    def get_total_memory(self) -> int:
        """Return the total memory of the GPU in bytes (via nvidia-smi).

        Raises:
            Exception: If the device is not a GPU or the query fails.
        """
        return self._query_gpu_memory_bytes("total")

    def get_free_memory(self) -> int:
        """Return the free memory of the GPU in bytes (via nvidia-smi).

        Raises:
            Exception: If the device is not a GPU or the query fails.
        """
        return self._query_gpu_memory_bytes("free")
original_model=MagicMock(latency_seconds=original_latency), hardware_setup=MagicMock(), optimized_model=MagicMock(latency_seconds=optimized_latency), ) self.assertGreater(res.latency_improvement_rate, 1) def test_latency_improvement_rate__rate_lt_1(self): original_latency = 0.5 optimized_latency = 1.0 res = OptimizeInferenceResult( original_model=MagicMock(latency_seconds=original_latency), hardware_setup=MagicMock(), optimized_model=MagicMock(latency_seconds=optimized_latency), ) self.assertLess(res.latency_improvement_rate, 1) def test_th_improvement_rate__optimized_model_is_none(self): res = OptimizeInferenceResult( original_model=MagicMock(), hardware_setup=MagicMock(), optimized_model=None, ) self.assertIsNone(res.throughput_improvement_rate) def test_th_improvement_rate__optimized_th_is_zero(self): original_th = 1.0 optimized_th = 0.0 res = OptimizeInferenceResult( original_model=MagicMock(throughput=original_th), hardware_setup=MagicMock(), optimized_model=MagicMock(throughput=optimized_th), ) self.assertEqual(0, res.throughput_improvement_rate) def test_th_improvement_rate__original_th_is_zero(self): original_th = 0.0 optimized_th = 1.0 res = OptimizeInferenceResult( original_model=MagicMock(throughput=original_th), hardware_setup=MagicMock(), optimized_model=MagicMock(throughput=optimized_th), ) self.assertEqual(-1, res.throughput_improvement_rate) def test_th_improvement_rate__rate_gt_1(self): original_th = 0.5 optimized_th = 1 res = OptimizeInferenceResult( original_model=MagicMock(throughput=original_th), hardware_setup=MagicMock(), optimized_model=MagicMock(throughput=optimized_th), ) self.assertGreater(res.throughput_improvement_rate, 1) def test_th_improvement_rate__rate_lt_1(self): original_th = 1.0 optimized_th = 0.5 res = OptimizeInferenceResult( original_model=MagicMock(throughput=original_th), hardware_setup=MagicMock(), optimized_model=MagicMock(throughput=optimized_th), ) self.assertLess(res.throughput_improvement_rate, 1) def 
test_size_improvement_rate__optimized_model_is_none(self): res = OptimizeInferenceResult( original_model=MagicMock(), hardware_setup=MagicMock(), optimized_model=None, ) self.assertIsNone(res.size_improvement_rate) def test_size_improvement_rate__optimized_size_is_zero(self): original_size = 1.0 optimized_size = 0.0 res = OptimizeInferenceResult( original_model=MagicMock(size_mb=original_size), hardware_setup=MagicMock(), optimized_model=MagicMock(size_mb=optimized_size), ) self.assertEqual(1, res.size_improvement_rate) def test_size_improvement_rate__original_size_is_zero(self): original_size = 0.0 optimized_size = 1.0 res = OptimizeInferenceResult( original_model=MagicMock(size_mb=original_size), hardware_setup=MagicMock(), optimized_model=MagicMock(size_mb=optimized_size), ) self.assertEqual(0, res.size_improvement_rate) def test_size_improvement_rate__rate_gt_1(self): original_size = 1 optimized_size = 0.5 res = OptimizeInferenceResult( original_model=MagicMock(size_mb=original_size), hardware_setup=MagicMock(), optimized_model=MagicMock(size_mb=optimized_size), ) self.assertGreater(res.size_improvement_rate, 1) def test_size_improvement_rate__rate_lt_1(self): original_size = 0.5 optimized_size = 1 res = OptimizeInferenceResult( original_model=MagicMock(size_mb=original_size), hardware_setup=MagicMock(), optimized_model=MagicMock(size_mb=optimized_size), ) self.assertLess(res.size_improvement_rate, 1) def test_metric_drop__optimized_model_is_none(self): res = OptimizeInferenceResult( original_model=MagicMock(), hardware_setup=MagicMock(), optimized_model=None, ) self.assertIsNone(res.metric_drop) def test_metric_drop(self): metric_drop = 0.1 res = OptimizeInferenceResult( original_model=MagicMock(), hardware_setup=MagicMock(), optimized_model=MagicMock(metric_drop=metric_drop), ) self.assertEqual(metric_drop, res.metric_drop) ================================================ FILE: optimization/nebullvm/nebullvm/core/types.py 
================================================ from typing import Union, Iterable, Sequence from nebullvm.tools.data import DataManager InputData = Union[Iterable, Sequence, DataManager] ================================================ FILE: optimization/nebullvm/nebullvm/installers/__init__.py ================================================ # flake8: noqa __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: optimization/nebullvm/nebullvm/installers/auto_installer.py ================================================ import argparse from typing import List, Union from loguru import logger from nebullvm.config import ( ONNX_MODULES, TENSORFLOW_MODULES, TORCH_MODULES, HUGGING_FACE_MODULES, DIFFUSERS_MODULES, ) from nebullvm.installers.installers import ( ONNXInstaller, PytorchInstaller, TensorflowInstaller, HuggingFaceInstaller, DiffusersInstaller, ) SUPPORTED_BACKENDS_DICT = { "torch": ["onnx"], "tensorflow": ["onnx"], "huggingface": ["torch", "tensorflow", "onnx"], "diffusers": ["torch", "onnx"], "onnx": [], } INSTALLERS = { "onnx": ONNXInstaller, "torch": PytorchInstaller, "tensorflow": TensorflowInstaller, "huggingface": HuggingFaceInstaller, "diffusers": DiffusersInstaller, } MODULES = { "onnx": ONNX_MODULES, "torch": TORCH_MODULES, "tensorflow": TENSORFLOW_MODULES, "huggingface": HUGGING_FACE_MODULES, "diffusers": DIFFUSERS_MODULES, } def select_frameworks_to_install( include_frameworks: Union[List[str], str], include_backends: Union[List[str], str], ) -> List[str]: supported_frameworks = list(INSTALLERS.keys()) if isinstance(include_frameworks, str) and include_frameworks == "all": frameworks_list = supported_frameworks elif isinstance(include_frameworks, list): frameworks_list = [] for framework in include_frameworks: if framework in supported_frameworks: frameworks_list.append(framework) else: logger.warning(f"Framework {framework} not supported") if len(frameworks_list) == 0: raise 
def select_compilers_to_install(
    include_compilers: Union[List[str], str], framework_list: List[str]
) -> List[str]:
    """Pick the compilers to install for the selected frameworks.

    Args:
        include_compilers: Either the string "all" or the names of the
            requested compilers.
        framework_list: Frameworks that are going to be installed.

    Returns:
        Sorted list, without duplicates, of the compilers that are listed in
        ``MODULES`` for at least one framework of ``framework_list``.
        Requested compilers that are unknown, or that are not available for
        the selected frameworks, are skipped with a warning.
    """
    known_compilers = {
        name for names in MODULES.values() for name in names
    }

    if isinstance(include_compilers, str) and include_compilers == "all":
        selected = {
            name
            for framework, names in MODULES.items()
            if framework in framework_list
            for name in names
        }
        return sorted(selected)

    selected = set()
    for name in include_compilers:
        if name not in known_compilers:
            logger.warning(f"Compiler {name} not supported")
            continue
        # Stops at the first framework providing the compiler.
        if any(name in MODULES[framework] for framework in framework_list):
            selected.add(name)
        else:
            logger.warning(
                f"Compiler {name} not supported for selected frameworks"
            )
    return sorted(selected)
def main():
    """Command line entry point of the auto installer.

    Parses ``--frameworks``, ``--extra-backends`` and ``--compilers`` and
    forwards them to ``auto_install_libraries``. A single ``all`` value is
    collapsed to the string "all"; a single ``none`` passed as extra backend
    becomes an empty list. Options left out keep the argparse default, the
    string "all", which passes through unchanged.
    """
    parser = argparse.ArgumentParser(
        description="Auto install dl frameworks and dependencies"
    )
    cli_options = (
        (("-f", "--frameworks"), "The base dl frameworks to be installed"),
        (
            ("-b", "--extra-backends"),
            "additional dl frameworks to be installed to "
            "gain the optimal speedup",
        ),
        (("-c", "--compilers"), "Compilers to be installed"),
    )
    for flags, help_text in cli_options:
        parser.add_argument(*flags, help=help_text, default="all", nargs="+")
    options = parser.parse_args()

    def collapse_all(values):
        # ["all"] -> "all"; anything else is forwarded untouched.
        if len(values) == 1 and values[0] == "all":
            return "all"
        return values

    backends = options.extra_backends
    if len(backends) == 1 and backends[0] in ("all", "none"):
        backend_list = "all" if backends[0] == "all" else []
    else:
        backend_list = backends

    auto_install_libraries(
        collapse_all(options.frameworks),
        backend_list,
        collapse_all(options.compilers),
    )
-d "BladeDISC" ] then git clone https://github.com/alibaba/BladeDISC.git fi cd BladeDISC && git submodule update --init --recursive # Install bazel sudo apt install apt-transport-https curl gnupg curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg sudo mv bazel-archive-keyring.gpg /usr/share/keyrings echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list sudo apt update && sudo apt install bazel sudo apt install default-jdk if [ $1 == "true" ] then cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh else if [[ $OSTYPE == "darwin"* ]] then export TORCH_BLADE_BUILD_WITH_CUDA_SUPPORT=OFF export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.10.0+aarch64 cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh else export TORCH_BLADE_BUILD_WITH_CUDA_SUPPORT=OFF export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.8.1+cpu cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh fi fi cd ../.. ================================================ FILE: optimization/nebullvm/nebullvm/installers/install_fastertransformer.sh ================================================ #!/bin/bash # TODO: check requirements # https://github.com/NVIDIA/FasterTransformer/blob/main/docs/bert_guide.md # Requirements #CMake >= 3.8 for Tensorflow, CMake >= 3.13 for PyTorch #CUDA 11.0 or newer version #Python: Only verify on python 3 #Tensorflow: Verify on 1.15, 1.13 and 1.14 should work. #PyTorch: Verify on 1.8.0, >= 1.5.0 should work. # Set non interactive mode for apt-get export DEBIAN_FRONTEND=noninteractive if [[ $OSTYPE == "darwin"* ]] then echo "MacOS is not supported for FasterTransformer" exit 1 fi if [ ! 
#!/bin/bash
# Install TensorRT. The pip wheel is tried first; if the installed module
# cannot be imported or cannot create a Builder, the TensorRT libraries are
# installed through the system package manager instead.

# CentOS-like distributions use pip3 and yum, every other one pip and apt-get.
if [[ "$(grep '^ID_LIKE' /etc/os-release)" == *"centos"* ]]
then
  PIP_CMD="pip3"
  PKG_MANAGER="yum"
else
  PIP_CMD="pip"
  PKG_MANAGER="apt-get"
fi

"$PIP_CMD" install --upgrade "setuptools<=65.7.0" pip

# If cuda version is less than 12.0 then install tensorrt<=8.5.3.1
if [[ $(nvidia-smi | grep CUDA | awk '{print $9}' | cut -d '.' -f 1) -lt 12 ]]
then
  python3 -m pip install --upgrade "tensorrt<=8.5.3.1"
else
  python3 -m pip install --upgrade "tensorrt<=8.6.1"
fi

"$PIP_CMD" install colored polygraphy --extra-index-url https://pypi.ngc.nvidia.com

# Test the exit status directly. Comparing the captured output with "1" missed
# the case where the import works but the Builder assertion fails, because
# the output then also contains the printed version.
if ! python3 -c "import tensorrt; print(tensorrt.__version__); assert tensorrt.Builder(tensorrt.Logger())"
then
  # Uninstall previous version (-y: never wait for a confirmation prompt)
  "$PIP_CMD" uninstall -y nvidia-tensorrt
  # install pre-requisites
  "$PIP_CMD" install numpy
  # NOTE(review): the first package was spelled "glibnvinfer8", which looks
  # like a typo for NVIDIA's "libnvinfer8" runtime package - confirm.
  "$PKG_MANAGER" -y update && \
  "$PKG_MANAGER" -y install libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev \
  libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer && \
  rm -rf /var/lib/apt/lists/*
fi
================================================ FILE: optimization/nebullvm/nebullvm/installers/install_tvm_prerequisites.sh ================================================ #!/bin/bash # Set non interactive mode for apt-get export DEBIAN_FRONTEND=noninteractive if [[ $OSTYPE == "darwin"* ]] then brew install gcc git cmake #brew install llvm conda install -y -c conda-forge clangdev elif [[ "$(grep '^ID_LIKE' /etc/os-release)" == *"centos"* ]] then sudo yum update -y && sudo yum install -y gcc gcc-c++ llvm-devel cmake3 git if [ -f "/usr/bin/cmake" ] then sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 \ --slave /usr/local/bin/ctest ctest /usr/bin/ctest \ --slave /usr/local/bin/cpack cpack /usr/bin/cpack \ --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake \ --family cmake sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 \ --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 \ --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 \ --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3 \ --family cmake else sudo ln -s /usr/bin/cmake3 /usr/bin/cmake fi else sudo apt-get update && sudo apt-get install -y libpython3.8 gcc libtinfo-dev zlib1g-dev \ build-essential cmake libedit-dev libxml2-dev llvm-12 fi ================================================ FILE: optimization/nebullvm/nebullvm/installers/installers.py ================================================ import os import platform import subprocess import sys from abc import ABC from pathlib import Path from typing import List import cpuinfo from loguru import logger from nebullvm.config import LIBRARIES_GPU from nebullvm.operations.optimizations.compilers.utils import ( deepsparse_is_available, get_faster_transformer_repo_path, intel_neural_compressor_is_available, openvino_is_available, tensorrt_is_available, torch_tensorrt_is_available, ) from nebullvm.optional_modules.torch import torch from nebullvm.tools.utils import check_module_version, gpu_is_available def 
def get_cpu_arch():
    """Return the CPU family of the current machine.

    Returns:
        str: "x86" when the architecture string reported by ``cpuinfo``
            contains "x86", "arm" in every other case.
    """
    arch = cpuinfo.get_cpu_info()["arch"].lower()
    return "x86" if "x86" in arch else "arm"


def _get_os():
    """Return the operating system name given by ``platform.system()``."""
    return platform.system()


def install_tvm(
    working_dir: str = None,
):
    """Helper function for installing ApacheTVM.

    This function needs some prerequisites for running, as a valid `git`
    installation and having MacOS or a Linux-distribution as OS.

    Args:
        working_dir (str, optional): The directory where the tvm repo will be
            cloned and installed. The user's home directory is used when
            it is not given.

    Returns:
        bool: True if ``tvm`` can be imported after the installation
            scripts have run, False otherwise.
    """
    path = Path(__file__).parent
    run_dir = working_dir or Path.home()

    # install pre-requisites
    installation_file_prerequisites = str(
        path / "install_tvm_prerequisites.sh"
    )
    subprocess.run(
        ["bash", installation_file_prerequisites],
        cwd=run_dir,
    )

    installation_file = str(path / "install_tvm.sh")
    hardware_config = get_cpu_arch()
    if gpu_is_available():
        hardware_config = f"{hardware_config}_cuda"
    # os.environ is expanded last, so a CONFIG_PATH already exported by the
    # caller takes precedence over the bundled configuration file.
    env_dict = {
        "CONFIG_PATH": str(
            path / f"tvm_installers/{hardware_config}/config.cmake"
        ),
        **os.environ,
    }
    subprocess.run(
        ["bash", installation_file],
        cwd=run_dir,
        env=env_dict,
    )

    try:
        import tvm  # noqa F401
    except ImportError:
        # The build did not produce an importable package: report the
        # failure instead of claiming success.
        return False

    return True


def install_bladedisc():
    """Helper function for installing BladeDisc.

    Returns:
        bool: True if ``torch_blade`` can be imported after the installation
            script has run, False otherwise.
    """
    has_cuda = bool(gpu_is_available())

    path = Path(__file__).parent
    installation_file = str(path / "install_bladedisc.sh")
    # Wait for the script to finish; checking the import while the
    # installer is still running would always report a failure.
    subprocess.run(["bash", installation_file, str(has_cuda).lower()])

    try:
        import torch_blade  # noqa F401
    except ImportError:
        return False

    return True
def _parse_cuda_major_version(nvidia_smi_output: str):
    """Extract the CUDA major version from the text printed by nvidia-smi.

    Args:
        nvidia_smi_output (str): Decoded standard output of ``nvidia-smi``.

    Returns:
        Optional[int]: The major version found after "CUDA Version:", or
            None when the output carries no numeric CUDA version.
    """
    # Local import so that the helper does not depend on the module header.
    import re

    match = re.search(r"CUDA Version:\s*(\d+)", nvidia_smi_output)
    return int(match.group(1)) if match else None


def install_torch_tensor_rt():
    """Helper function for installing Torch-TensorRT.

    The function will install the software only if a cuda driver is
    available.

    Returns:
        bool: True if ``torch_tensorrt`` can be imported after the
            installation, False otherwise (including the case of an
            unsupported PyTorch version).

    Raises:
        RuntimeError: If no cuda driver is available.
    """
    if not gpu_is_available():
        raise RuntimeError(
            "Torch-TensorRT can run just on Nvidia machines. "
            "No available cuda driver has been found."
        )
    elif not check_module_version(
        torch, min_version="1.12.0", max_version="1.13.1+cu117"
    ):
        logger.warning(
            "Torch-TensorRT can be installed only for "
            "'PyTorch>=1.12, <=1.13.1'. Please update your Pytorch "
            "version accordingly if you want to use Torch-TensorRT."
        )
        return False

    # Verify that TensorRT is installed, otherwise install it
    try:
        import tensorrt  # noqa F401
    except ImportError:
        install_tensor_rt()

    cmd = [
        "pip3",
        "install",
        "torch-tensorrt",
        "--find-links",
        "https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.3.0",
    ]
    subprocess.run(cmd)

    # Search for the "CUDA Version" field instead of relying on its position
    # in the banner, so a different layout or an "N/A" value does not raise.
    smi_output = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
    cuda_version = _parse_cuda_major_version(smi_output)

    if cuda_version is not None and cuda_version >= 12:
        cmd = [
            "pip3",
            "install",
            "tensorrt>=8.6.0,<=8.6.1",
        ]
        subprocess.run(cmd)

    try:
        import torch_tensorrt  # noqa F401
    except ImportError:
        return False

    return True


def install_tf2onnx():
    """Helper function for installing tf2onnx.

    conda is used on Apple Silicon, pip (together with a compatible
    protobuf release) everywhere else.

    Returns:
        bool: False if ``tf2onnx`` cannot be imported after the
            installation, True otherwise.
    """
    if _get_os() == "Darwin" and get_cpu_arch() == "arm":
        cmd = ["conda", "install", "-y", "tf2onnx>=1.8.4"]
        subprocess.run(cmd)
    else:
        cmd = ["pip3", "install", "--user", "protobuf<4,>=3.20.2"]
        subprocess.run(cmd)
        cmd = ["pip3", "install", "tf2onnx>=1.8.4"]
        subprocess.run(cmd)

    try:
        import tf2onnx  # noqa F401
    except ImportError:
        return False
    except AttributeError:
        # Sometimes the import could raise an attribute error
        # if installation fails
        # NOTE(review): this case is still reported as a success by the
        # return below - confirm that this is intended.
        pass

    return True


def install_tensor_rt():
    """Helper function for installing TensorRT.

    The function will install the software only if a cuda driver is
    available.

    Returns:
        bool: True if both ``polygraphy`` and ``tensorrt`` can be imported
            after the installation script has run, False otherwise.

    Raises:
        RuntimeError: If no cuda driver is available.
    """
    if not gpu_is_available():
        raise RuntimeError(
            "TensorRT can run just on Nvidia machines. "
            "No available cuda driver has been found."
        )

    path = Path(__file__).parent
    installation_file = str(path / "install_tensor_rt.sh")
    subprocess.run(["bash", installation_file])

    try:
        import polygraphy  # noqa F401
        import tensorrt  # noqa F401
    except ImportError:
        return False

    return True
def install_openvino(with_optimization: bool = True):
    """Helper function for installing the OpenVino compiler.

    This function just works on intel machines.

    Args:
        with_optimization (bool): Flag for installing the full openvino
            engine or limiting the installation to the tools need for
            inference models.

    Returns:
        bool: True if the ``openvino.runtime`` classes can be imported after
            the installation, False otherwise.

    Raises:
        RuntimeError: If the processor brand does not contain "intel".
    """
    processor = cpuinfo.get_cpu_info()["brand_raw"].lower()
    if "intel" not in processor:
        raise RuntimeError(
            f"Openvino can run just on Intel machines. "
            f"You are trying to install it on {processor}"
        )

    openvino_version = "openvino-dev" if with_optimization else "openvino"

    # If on windows
    if _get_os() == "Windows":
        cmd = ["pip3", "install", "--user", f"{openvino_version}>=2022.1.0"]
    else:
        cmd = ["pip3", "install", f"{openvino_version}>=2022.1.0"]
    subprocess.run(cmd)

    cmd = ["pip3", "install", "scipy>=1.7.3"]
    subprocess.run(cmd)

    try:
        from openvino.runtime import (  # noqa F401
            CompiledModel,
            Core,
            InferRequest,
            Model,
        )
    except ImportError:
        return False

    return True


def install_onnxruntime():
    """Helper function for installing the right version of onnxruntime.

    The "-gpu" distribution is selected when a GPU is available.

    Returns:
        bool: True if ``onnxruntime`` can be imported after the
            installation, False otherwise.
    """
    distribution_name = "onnxruntime"
    if gpu_is_available():
        distribution_name = f"{distribution_name}-gpu"

    if _get_os() == "Darwin" and get_cpu_arch() == "arm":
        cmd = ["conda", "install", "-y", distribution_name]
    else:
        cmd = ["pip3", "install", distribution_name]
    subprocess.run(cmd)

    # install requirements for onnxruntime.transformers
    cmd = ["pip3", "install", "coloredlogs", "sympy"]
    subprocess.run(cmd)

    try:
        import onnxruntime  # noqa F401
    except ImportError:
        return False

    return True


def install_deepsparse():
    """Helper function for installing DeepSparse.

    Returns:
        bool: True if ``deepsparse`` can be imported after the installation,
            False otherwise.

    Raises:
        RuntimeError: On MacOS, Windows or arm processors.
    """
    python_minor_version = sys.version_info.minor
    os_ = platform.system()

    if os_ in ["Darwin", "Windows"] or get_cpu_arch() == "arm":
        raise RuntimeError(
            "DeepSparse is not supported on this platform. "
            "It won't be installed."
        )

    try:
        # "-y" keeps apt-get from waiting for a confirmation that nobody can
        # give during an automated installation.
        cmd = [
            "apt-get",
            "install",
            "-y",
            f"python3.{python_minor_version}-venv",
        ]
        subprocess.run(cmd)
    except (OSError, subprocess.SubprocessError):
        # Best effort: apt-get may be missing or not executable here.
        pass

    cmd = ["pip3", "install", "deepsparse"]
    subprocess.run(cmd)

    try:
        cmd = ["pip3", "install", "numpy>=1.22.0,<1.24.0"]
        subprocess.run(cmd)
    except (OSError, subprocess.SubprocessError):
        # For python 3.7 numpy 1.22.0 is not available
        pass

    try:
        from deepsparse import compile_model, cpu  # noqa F401
    except ImportError:
        return False

    return True
def install_intel_neural_compressor():
    """Install Intel Neural Compressor with pip.

    Returns:
        bool: Whether the ``neural_compressor.experimental`` classes can be
            imported once pip has run.

    Raises:
        RuntimeError: If the processor brand does not contain "intel".
    """
    cpu_brand = cpuinfo.get_cpu_info()["brand_raw"].lower()
    if "intel" not in cpu_brand:
        raise RuntimeError(
            f"Intel Neural Compressor can run just on Intel machines. "
            f"You are trying to install it on {cpu_brand}"
        )

    subprocess.run(["pip3", "install", "--user", "neural-compressor"])

    try:
        from neural_compressor.experimental import (  # noqa F401
            MixedPrecision,
            Quantization,
        )
    except ImportError:
        installed = False
    else:
        installed = True
    return installed


def install_onnx_simplifier():
    """Install the ONNX simplifier (pip is skipped on arm processors).

    Returns:
        bool: Whether ``onnxsim`` can be imported afterwards.
    """
    on_arm = get_cpu_arch() == "arm"
    if not on_arm:
        # Install onnx simplifier
        subprocess.run(["pip3", "install", "onnxsim"])

    try:
        import onnxsim  # noqa F401
    except ImportError:
        installed = False
    else:
        installed = True
    return installed
def install_faster_transformer(
    working_dir: str = None,
):
    """Helper function for installing FasterTransformer.
    https://github.com/NVIDIA/FasterTransformer

    This function needs some prerequisites for running, as a valid `git`
    installation and having MacOS or a Linux-distribution as OS.

    Args:
        working_dir (str, optional): The directory where the
            FasterTransformer repo will be cloned and installed.
            Default: None
            NOTE(review): the value is currently not used, the script always
            runs in the parent of ``get_faster_transformer_repo_path()``.

    Returns:
        bool: True if the installation script exits with status 0, False if
            it fails, if no GPU is available, if torch cannot be imported or
            if the compute capability cannot be read.
    """
    if not gpu_is_available():
        return False

    path = Path(__file__).parent

    # install faster transformer
    try:
        import torch
    except ImportError:
        return False

    compute_capability = torch.cuda.get_device_capability()
    # Explicit check instead of an assert, which is removed by "python -O".
    if len(compute_capability) != 2:
        return False

    installation_file = str(path / "install_fastertransformer.sh")
    # os.environ is expanded last, so a COMPUTE_CAPABILITY already exported
    # by the caller takes precedence over the detected one.
    env_dict = {
        "COMPUTE_CAPABILITY": (
            f"{compute_capability[0]}{compute_capability[1]}"
        ),
        **os.environ,
    }
    result = subprocess.run(
        ["bash", installation_file],
        cwd=get_faster_transformer_repo_path().parent,
        env=env_dict,
    )

    # check result
    return result.returncode == 0


class BaseInstaller(ABC):
    """Base class of the framework installers.

    Subclasses provide the static methods that check and install a
    framework and its dependencies; the compiler installation is shared.
    """

    def __init__(self, module_list: List[str]):
        """Store the names of the compilers handled by this installer.

        Args:
            module_list (List[str]): Compiler names, used as keys of
                COMPILER_INSTALLERS and COMPILERS_AVAILABLE.
        """
        self.modules = module_list

    def install_compilers(
        self,
        include_libraries: List[str],
    ):
        """Install every compiler of ``self.modules`` that is requested.

        A compiler is skipped when ``include_libraries`` is a list that does
        not contain it, or when it needs a GPU and none is available.
        Failures are logged and never raised.

        Args:
            include_libraries (List[str]): Compilers to install. No
                filtering is applied when the value is not a list.
        """
        for library in self.modules:
            if (
                isinstance(include_libraries, list)
                and library not in include_libraries
            ) or (not gpu_is_available() and library in LIBRARIES_GPU):
                continue

            logger.info(f"Trying to install {library} on the platform...")

            try:
                if not COMPILERS_AVAILABLE[library]():
                    install_ok = COMPILER_INSTALLERS[library]()
                else:
                    install_ok = True
            except Exception as ex:
                # Best effort: a failing compiler must not stop the others,
                # but the reason is logged rather than silently dropped.
                logger.warning(
                    f"Installation of {library} raised "
                    f"{type(ex).__name__}: {ex}"
                )
                install_ok = False

            if not install_ok:
                logger.warning(
                    f"Unable to install {library} on this platform. "
                    f"The compiler will be skipped. "
                )
            else:
                logger.info(f"{library} installed successfully!")

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install the extra packages of the framework (subclass hook)."""
        raise NotImplementedError

    @staticmethod
    def check_framework():
        """Check that the framework is installed (subclass hook)."""
        raise NotImplementedError

    @staticmethod
    def install_framework():
        """Install the framework itself (subclass hook)."""
        raise NotImplementedError
class PytorchInstaller(BaseInstaller):
    """Installer for PyTorch."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """PyTorch needs no additional package."""
        return

    @staticmethod
    def check_framework():
        """Check that PyTorch is installed.

        Returns:
            bool: Always True when torch can be imported; an unsupported
                version only produces a warning.

        Raises:
            ImportError: If torch cannot be imported.
        """
        try:
            import torch  # noqa F401
        except ImportError:
            raise ImportError(
                "No PyTorch found in your python environment. Please install "
                "it from https://pytorch.org/get-started/locally/."
            )

        if not check_module_version(
            torch, min_version="1.12.0", max_version="2.0.1+cu118"
        ):
            logger.warning(
                "PyTorch version is not supported. Please install "
                "PyTorch >= 1.12.0 and <= 2.0.1."
            )

        return True

    @staticmethod
    def install_framework():
        """Install PyTorch with pip.

        Returns:
            bool: True if torch can be imported afterwards, False otherwise.
        """
        cmd = ["pip3", "install", "torch>=1.12.0, <=2.0.1"]
        subprocess.run(cmd)

        try:
            import torch  # noqa F401
        except ImportError:
            return False

        return True


class TensorflowInstaller(BaseInstaller):
    """Installer for TensorFlow."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install tf2onnx when "onnx" is among the requested frameworks."""
        if "onnx" in include_framework:
            install_tf2onnx()

    @staticmethod
    def check_framework():
        """Check that a supported TensorFlow version is installed.

        Returns:
            bool: True if tensorflow can be imported and its version is in
                the supported range, False otherwise.
        """
        try:
            import tensorflow  # noqa F401
        except ImportError:
            return False

        if not check_module_version(
            tensorflow, min_version="2.7.0", max_version="2.12.0"
        ):
            logger.warning(
                "TensorFlow version is not supported. Please install "
                "TensorFlow >= 2.7.0 and <= 2.12.0."
            )
            return False

        return True

    @staticmethod
    def install_framework():
        """Install TensorFlow (conda on Apple Silicon, pip elsewhere).

        Returns:
            bool: True if tensorflow can be imported afterwards, False
                otherwise.
        """
        if _get_os() == "Darwin" and get_cpu_arch() == "arm":
            cmd = [
                "conda",
                "install",
                "-y",
                # The upper bound needs its "<=" operator, as in the pip
                # branch and in check_framework.
                "tensorflow>=2.7.0,<=2.12.0",
                "numpy<1.24",
            ]
            subprocess.run(cmd)
        else:
            cmd = ["pip3", "install", "--user", "tensorflow>=2.7.0, <=2.12.0"]
            subprocess.run(cmd)

        try:
            import tensorflow  # noqa F401
        except ImportError:
            return False

        return True
class ONNXInstaller(BaseInstaller):
    """Installer for ONNX and the packages used next to it."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install onnxruntime, onnxmltools and the ONNX simplifier."""
        install_onnxruntime()
        subprocess.run(["pip3", "install", "onnxmltools>=1.11.0"])
        install_onnx_simplifier()

    @staticmethod
    def check_framework():
        """Return True only for an importable ONNX of a supported version."""
        try:
            import onnx  # noqa F401
        except ImportError:
            return False

        version_ok = check_module_version(
            onnx, min_version="1.10.0", max_version="1.14.0"
        )
        if version_ok:
            return True

        logger.warning(
            "ONNX version is not supported. Please install "
            "ONNX >= 1.10.0 and <= 1.14.0."
        )
        return False

    @staticmethod
    def install_framework():
        """Install ONNX with pip and report whether it can be imported."""
        on_apple_silicon = _get_os() == "Darwin" and get_cpu_arch() == "arm"
        if on_apple_silicon:
            subprocess.run(["pip3", "install", "cmake"])

        subprocess.run(["pip3", "install", "onnx>=1.10.0, <=1.14.0"])

        try:
            import onnx  # noqa F401
        except ImportError:
            return False
        else:
            return True


class HuggingFaceInstaller(BaseInstaller):
    """Installer for HuggingFace transformers."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Nothing else has to be installed for transformers."""
        return None

    @staticmethod
    def check_framework():
        """Return whether transformers can be imported."""
        try:
            import transformers  # noqa F401
        except ImportError:
            return False
        else:
            return True

    @staticmethod
    def install_framework():
        """Install transformers with pip and report the import result."""
        subprocess.run(["pip3", "install", "transformers<=4.28.0"])

        try:
            import transformers  # noqa F401
        except ImportError:
            return False
        else:
            return True


class DiffusersInstaller(BaseInstaller):
    """Installer for HuggingFace diffusers."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install transformers and, with a GPU, the CUDA related extras."""
        subprocess.run(["pip3", "install", "transformers<=4.28.0"])

        # NOTE(review): the three installs below are assumed to be the
        # complete GPU-only branch - confirm against the original layout.
        if not gpu_is_available():
            return

        gpu_commands = (
            ["pip3", "install", "cuda-python"],
            ["pip3", "install", "onnx>=1.10.0, <=1.14.0"],
            [
                "pip3",
                "install",
                "onnx_graphsurgeon",
                "--index-url",
                "https://pypi.ngc.nvidia.com",
            ],
        )
        for command in gpu_commands:
            subprocess.run(command)

    @staticmethod
    def check_framework():
        """Return True only for an importable diffusers >= 0.13.0."""
        try:
            import diffusers  # noqa F401
        except ImportError:
            return False

        if check_module_version(diffusers, min_version="0.13.0"):
            return True
        return False

    @staticmethod
    def install_framework():
        """Install diffusers with pip and report the import result."""
        subprocess.run(["pip3", "install", "diffusers>=0.13.0, <=0.15.0"])

        try:
            import diffusers  # noqa F401
        except ImportError:
            return False
        else:
            return True


# Installation function of every compiler, looked up by compiler name in
# BaseInstaller.install_compilers.
COMPILER_INSTALLERS = dict(
    openvino=install_openvino,
    tensor_rt=install_tensor_rt,
    torch_tensor_rt=install_torch_tensor_rt,
    deepsparse=install_deepsparse,
    intel_neural_compressor=install_intel_neural_compressor,
    # faster_transformer=install_faster_transformer,
)

# Availability check of every compiler, with the same keys as above.
COMPILERS_AVAILABLE = dict(
    openvino=openvino_is_available,
    tensor_rt=tensorrt_is_available,
    torch_tensor_rt=torch_tensorrt_is_available,
    deepsparse=deepsparse_is_available,
    intel_neural_compressor=intel_neural_compressor_is_available,
    # faster_transformer=faster_transformer_is_available,
)
# Compilers expected for a torch based installation when "all" is requested.
_TORCH_COMPILERS = [
    "deepsparse",
    "faster_transformer",
    "intel_neural_compressor",
    "tensor_rt",
    "torch_tensor_rt",
]


def _check_frameworks(frameworks, backends, expected):
    """Assert the frameworks selected for the requested ones and backends."""
    selected = select_frameworks_to_install(frameworks, backends)
    assert selected == expected


def _check_compilers(compilers, frameworks, expected):
    """Assert the compilers selected for the given framework list."""
    selected = select_compilers_to_install(compilers, frameworks)
    assert selected == expected


def test_install_default_option():
    _check_frameworks(
        "all",
        "all",
        ["diffusers", "huggingface", "onnx", "tensorflow", "torch"],
    )


def test_install_torch_full():
    _check_frameworks(["torch"], "all", ["onnx", "torch"])


def test_install_torch_base():
    _check_frameworks(["torch"], [], ["torch"])


def test_install_tensorflow_full():
    _check_frameworks(["tensorflow"], "all", ["onnx", "tensorflow"])


def test_install_tensorflow_base():
    _check_frameworks(["tensorflow"], [], ["tensorflow"])


def test_install_onnx_full():
    _check_frameworks(["onnx"], "all", ["onnx"])


def test_install_onnx_base():
    _check_frameworks(["onnx"], [], ["onnx"])


def test_install_diffusers_full():
    _check_frameworks(["diffusers"], "all", ["diffusers", "onnx", "torch"])


def test_install_huggingface_full():
    _check_frameworks(
        ["huggingface"],
        "all",
        ["huggingface", "onnx", "tensorflow", "torch"],
    )


def test_install_huggingface_full_tf():
    _check_frameworks(
        ["huggingface"],
        ["onnx", "tensorflow"],
        ["huggingface", "onnx", "tensorflow"],
    )


def test_install_huggingface_full_torch():
    _check_frameworks(
        ["huggingface"],
        ["onnx", "torch"],
        ["huggingface", "onnx", "torch"],
    )


def test_install_huggingface_tf():
    _check_frameworks(
        ["huggingface"], ["tensorflow"], ["huggingface", "tensorflow"]
    )


def test_install_huggingface_torch():
    _check_frameworks(["huggingface"], ["torch"], ["huggingface", "torch"])


def test_install_huggingface_compilers_all():
    _check_compilers("all", ["huggingface"], [])


def test_install_huggingface_torch_compilers_all():
    _check_compilers("all", ["huggingface", "torch"], list(_TORCH_COMPILERS))


def test_install_torch_compilers_all():
    _check_compilers("all", ["torch"], list(_TORCH_COMPILERS))


def test_install_torch_compilers_deepsparse():
    _check_compilers(["deepsparse"], ["torch"], ["deepsparse"])


def test_install_torch_compilers_invalid():
    _check_compilers(["best_compiler"], ["torch"], [])


def test_install_torch_onnx_compilers_all():
    _check_compilers(
        "all",
        ["torch", "onnx"],
        [
            "deepsparse",
            "faster_transformer",
            "intel_neural_compressor",
            "openvino",
            "tensor_rt",
            "torch_tensor_rt",
        ],
    )


def test_install_tensorflow_compilers_all():
    _check_compilers("all", ["tensorflow"], [])
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #-------------------------------------------------------------------- # Template custom cmake configuration for compiling # # This file is used to override the build options in build. # If you want to change the configuration, please use the following # steps. Assume you are on the root directory. First copy this # file so that any local changes will be ignored by git # # $ mkdir build # $ cp cmake/config.cmake build # # Next modify the corresponding entries, and then compile by # # $ cd build # $ cmake .. # # Then build in parallel with 8 threads # # $ make -j8 #-------------------------------------------------------------------- #--------------------------------------------- # Backend runtimes.
#--------------------------------------------- # Whether enable CUDA during compile, # # Possible values: # - ON: enable CUDA with cmake's auto search # - OFF: disable CUDA # - /path/to/cuda: use specific path to cuda toolkit set(USE_CUDA OFF) # Whether enable ROCM runtime # # Possible values: # - ON: enable ROCM with cmake's auto search # - OFF: disable ROCM # - /path/to/rocm: use specific path to rocm set(USE_ROCM OFF) # Whether enable SDAccel runtime set(USE_SDACCEL OFF) # Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime set(USE_AOCL OFF) # Whether enable OpenCL runtime # # Possible values: # - ON: enable OpenCL with cmake's auto search # - OFF: disable OpenCL # - /path/to/opencl-sdk: use specific path to opencl-sdk set(USE_OPENCL OFF) # Whether enable Metal runtime set(USE_METAL OFF) # Whether enable Vulkan runtime # # Possible values: # - ON: enable Vulkan with cmake's auto search # - OFF: disable vulkan # - /path/to/vulkan-sdk: use specific path to vulkan-sdk set(USE_VULKAN OFF) # Whether enable OpenGL runtime set(USE_OPENGL OFF) # Whether enable MicroTVM runtime set(USE_MICRO OFF) # Whether enable RPC runtime set(USE_RPC ON) # Whether to build the C++ RPC server binary set(USE_CPP_RPC OFF) # Whether to build the iOS RPC server application set(USE_IOS_RPC OFF) # Whether embed stackvm into the runtime set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph executor. set(USE_GRAPH_EXECUTOR ON) # Whether enable tiny graph executor with CUDA Graph set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) # Whether enable pipeline executor. 
set(USE_PIPELINE_EXECUTOR OFF) # Whether to enable the profiler for the graph executor and vm set(USE_PROFILER ON) # Whether enable microTVM standalone runtime set(USE_MICRO_STANDALONE_RUNTIME OFF) # Whether build with LLVM support # Requires LLVM version >= 4.0 # # Possible values: # - ON: enable llvm with cmake's find search # - OFF: disable llvm, note this will disable CPU codegen # which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. set(USE_LLVM ON) #--------------------------------------------- # Contrib libraries #--------------------------------------------- # Whether to build with BYODT software emulated posit custom datatype # # Possible values: # - ON: enable BYODT posit, requires setting UNIVERSAL_PATH # - OFF: disable BYODT posit # # set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON set(USE_BYODT_POSIT OFF) # Whether use BLAS, choices: openblas, atlas, apple set(USE_BLAS none) # Whether to use MKL # Possible values: # - ON: Enable MKL # - /path/to/mkl: mkl root path # - OFF: Disable MKL # set(USE_MKL /opt/intel/mkl) for UNIX # set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32 # set(USE_MKL ) if using `pip install mkl` set(USE_MKL OFF) # Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library set(USE_MKLDNN OFF) # Whether use OpenMP thread pool, choices: gnu, intel # Note: "gnu" uses gomp library, "intel" uses iomp5 library set(USE_OPENMP none) # Whether use contrib.random in runtime set(USE_RANDOM ON) # Whether use NNPack set(USE_NNPACK OFF) # Possible values: # - ON: enable tflite with cmake's find search # - OFF: disable tflite # - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library set(USE_TFLITE OFF) # /path/to/tensorflow: tensorflow root path when use tflite library set(USE_TENSORFLOW_PATH none) # Required for full builds with TFLite. Not needed for runtime with TFLite. 
# /path/to/flatbuffers: flatbuffers root path when using tflite library set(USE_FLATBUFFERS_PATH none) # Possible values: # - OFF: disable tflite support for edgetpu # - /path/to/edgetpu: use specific path to edgetpu library set(USE_EDGETPU OFF) # Possible values: # - ON: enable cuDNN with cmake's auto search in CUDA directory # - OFF: disable cuDNN # - /path/to/cudnn: use specific path to cuDNN path set(USE_CUDNN OFF) # Whether use cuBLAS set(USE_CUBLAS OFF) # Whether use MIOpen set(USE_MIOPEN OFF) # Whether use MPS set(USE_MPS OFF) # Whether use rocBlas set(USE_ROCBLAS OFF) # Whether use contrib sort set(USE_SORT ON) # Whether use MKL-DNN (DNNL) codegen set(USE_DNNL_CODEGEN OFF) # Whether to use Arm Compute Library (ACL) codegen # We provide 2 separate flags since we cannot build the ACL runtime on x86. # This is useful for cases where you want to cross-compile a relay graph # on x86 then run on AArch. # # An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst. # # USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported # operators to Arm Compute Library. OFF/ON # USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL # runtime. OFF/ON/"path/to/ACL" set(USE_ARM_COMPUTE_LIB OFF) set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF) # Whether to build with Arm Ethos-N support # Possible values: # - OFF: disable Arm Ethos-N support # - path/to/arm-ethos-N-stack: use a specific version of the # Ethos-N driver stack set(USE_ETHOSN OFF) # If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine # otherwise use ETHOSN_HW (OFF) to use the software test infrastructure set(USE_ETHOSN_HW OFF) # Whether to build with Arm(R) Ethos(TM)-U NPU codegen support set(USE_ETHOSU OFF) # Whether to build with TensorRT codegen or runtime # Examples are available here: docs/deploy/tensorrt.rst. 
# # USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are # offloaded to TensorRT. OFF/ON # USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of # TensorRT library. OFF/ON/"path/to/TensorRT" set(USE_TENSORRT_CODEGEN OFF) set(USE_TENSORRT_RUNTIME OFF) # Whether use VITIS-AI codegen set(USE_VITIS_AI OFF) # Build Verilator codegen and runtime set(USE_VERILATOR OFF) # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) # - OFF: disable ANTLR # - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file set(USE_ANTLR OFF) # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) # Whether to build fast VTA simulator driver set(USE_VTA_FSIM OFF) # Whether to build cycle-accurate VTA simulator driver set(USE_VTA_TSIM OFF) # Whether to build VTA FPGA driver (device side only) set(USE_VTA_FPGA OFF) # Whether use Thrust set(USE_THRUST OFF) # Whether to build the TensorFlow TVMDSOOp module set(USE_TF_TVMDSOOP OFF) # Whether to build the PyTorch custom class module set(USE_PT_TVMDSOOP OFF) # Whether to use STL's std::unordered_map or TVM's POD compatible Map set(USE_FALLBACK_STL_MAP OFF) # Whether to use hexagon device set(USE_HEXAGON_DEVICE OFF) set(USE_HEXAGON_SDK /path/to/sdk) # Whether to build the hexagon launcher set(USE_HEXAGON_LAUNCHER OFF) # Hexagon architecture to target when compiling TVM itself (not the target for # compiling _by_ TVM). This applies to components like the TVM runtime, but is # also used to select correct include/library paths from the Hexagon SDK when # building offloading runtime for Android. # Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66") # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) # Whether enable BNNS runtime set(USE_BNNS OFF) # Whether to use libbacktrace # Libbacktrace provides line and column information on stack traces from errors. # It is only supported on linux and macOS. # Possible values: # - AUTO: auto set according to system information and feasibility # - ON: enable libbacktrace # - OFF: disable libbacktrace set(USE_LIBBACKTRACE AUTO) # Whether to build static libtvm_runtime.a, the default is to build the dynamic # version: libtvm_runtime.so. # # The static runtime library needs to be linked into executables with the linker # option --whole-archive (or its equivalent). The reason is that the TVM registry # mechanism relies on global constructors being executed at program startup. # Global constructors alone are not sufficient for the linker to consider a # library member to be used, and some of such library members (object files) may # not be included in the final executable. This would make the corresponding # runtime functions to be unavailable to the program. set(BUILD_STATIC_RUNTIME OFF) # Caches the build so that building is faster when switching between branches. # If you switch branches, build and then encounter a linking error, you may # need to regenerate the build tree through "make .." (the cache will # still provide significant speedups). # Possible values: # - AUTO: search for path to ccache, disable if not found. # - ON: enable ccache by searching for the path to ccache, report an error if not found # - OFF: disable ccache # - /path/to/ccache: use specific path to ccache set(USE_CCACHE AUTO) # Whether to enable PAPI support in profiling. PAPI provides access to hardware # counters while profiling. # Possible values: # - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc # - OFF: disable PAPI support. # - /path/to/folder/containing/: Path to folder containing papi.pc. 
set(USE_PAPI OFF) # Whether to use GoogleTest for C++ unit tests. When enabled, the generated # build file (e.g. Makefile) will have a target "cpptest". # Possible values: # - ON: enable GoogleTest. The package `GTest` will be required for cmake # to succeed. # - OFF: disable GoogleTest. # - AUTO: cmake will attempt to find the GTest package, if found GTest will # be enabled, otherwise it will be disabled. # Note that cmake will use `find_package` to find GTest. Please use cmake's # predefined variables to specify the path to the GTest package if needed. set(USE_GTEST AUTO) # Enable using CUTLASS as a BYOC backend # Need to have USE_CUDA=ON set(USE_CUTLASS OFF) ================================================ FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/arm_cuda/config.cmake ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #-------------------------------------------------------------------- # Template custom cmake configuration for compiling # # This file is used to override the build options in build. # If you want to change the configuration, please use the following # steps. Assume you are on the root directory. 
First copy this # file so that any local changes will be ignored by git # # $ mkdir build # $ cp cmake/config.cmake build # # Next modify the according entries, and then compile by # # $ cd build # $ cmake .. # # Then build in parallel with 8 threads # # $ make -j8 #-------------------------------------------------------------------- #--------------------------------------------- # Backend runtimes. #--------------------------------------------- # Whether enable CUDA during compile, # # Possible values: # - ON: enable CUDA with cmake's auto search # - OFF: disable CUDA # - /path/to/cuda: use specific path to cuda toolkit set(USE_CUDA ON) # Whether enable ROCM runtime # # Possible values: # - ON: enable ROCM with cmake's auto search # - OFF: disable ROCM # - /path/to/rocm: use specific path to rocm set(USE_ROCM OFF) # Whether enable SDAccel runtime set(USE_SDACCEL OFF) # Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime set(USE_AOCL OFF) # Whether enable OpenCL runtime # # Possible values: # - ON: enable OpenCL with cmake's auto search # - OFF: disable OpenCL # - /path/to/opencl-sdk: use specific path to opencl-sdk set(USE_OPENCL OFF) # Whether enable Metal runtime set(USE_METAL OFF) # Whether enable Vulkan runtime # # Possible values: # - ON: enable Vulkan with cmake's auto search # - OFF: disable vulkan # - /path/to/vulkan-sdk: use specific path to vulkan-sdk set(USE_VULKAN OFF) # Whether enable OpenGL runtime set(USE_OPENGL OFF) # Whether enable MicroTVM runtime set(USE_MICRO OFF) # Whether enable RPC runtime set(USE_RPC ON) # Whether to build the C++ RPC server binary set(USE_CPP_RPC OFF) # Whether to build the iOS RPC server application set(USE_IOS_RPC OFF) # Whether embed stackvm into the runtime set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph executor. set(USE_GRAPH_EXECUTOR ON) # Whether enable tiny graph executor with CUDA Graph set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) # Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF) # Whether to enable the profiler for the graph executor and vm set(USE_PROFILER ON) # Whether enable microTVM standalone runtime set(USE_MICRO_STANDALONE_RUNTIME OFF) # Whether build with LLVM support # Requires LLVM version >= 4.0 # # Possible values: # - ON: enable llvm with cmake's find search # - OFF: disable llvm, note this will disable CPU codegen # which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. set(USE_LLVM ON) #--------------------------------------------- # Contrib libraries #--------------------------------------------- # Whether to build with BYODT software emulated posit custom datatype # # Possible values: # - ON: enable BYODT posit, requires setting UNIVERSAL_PATH # - OFF: disable BYODT posit # # set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON set(USE_BYODT_POSIT OFF) # Whether use BLAS, choices: openblas, atlas, apple set(USE_BLAS none) # Whether to use MKL # Possible values: # - ON: Enable MKL # - /path/to/mkl: mkl root path # - OFF: Disable MKL # set(USE_MKL /opt/intel/mkl) for UNIX # set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32 # set(USE_MKL ) if using `pip install mkl` set(USE_MKL OFF) # Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library set(USE_MKLDNN OFF) # Whether use OpenMP thread pool, choices: gnu, intel # Note: "gnu" uses gomp library, "intel" uses iomp5 library set(USE_OPENMP none) # Whether use contrib.random in runtime set(USE_RANDOM ON) # Whether use NNPack set(USE_NNPACK OFF) # Possible values: # - ON: enable tflite with cmake's find search # - OFF: disable tflite # - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library set(USE_TFLITE OFF) # /path/to/tensorflow: tensorflow root path when use tflite library set(USE_TENSORFLOW_PATH none) # Required for full builds with TFLite. Not needed for runtime with TFLite. 
# /path/to/flatbuffers: flatbuffers root path when using tflite library set(USE_FLATBUFFERS_PATH none) # Possible values: # - OFF: disable tflite support for edgetpu # - /path/to/edgetpu: use specific path to edgetpu library set(USE_EDGETPU OFF) # Possible values: # - ON: enable cuDNN with cmake's auto search in CUDA directory # - OFF: disable cuDNN # - /path/to/cudnn: use specific path to cuDNN path set(USE_CUDNN OFF) # Whether use cuBLAS set(USE_CUBLAS OFF) # Whether use MIOpen set(USE_MIOPEN OFF) # Whether use MPS set(USE_MPS OFF) # Whether use rocBlas set(USE_ROCBLAS OFF) # Whether use contrib sort set(USE_SORT ON) # Whether use MKL-DNN (DNNL) codegen set(USE_DNNL_CODEGEN OFF) # Whether to use Arm Compute Library (ACL) codegen # We provide 2 separate flags since we cannot build the ACL runtime on x86. # This is useful for cases where you want to cross-compile a relay graph # on x86 then run on AArch. # # An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst. # # USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported # operators to Arm Compute Library. OFF/ON # USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL # runtime. OFF/ON/"path/to/ACL" set(USE_ARM_COMPUTE_LIB OFF) set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF) # Whether to build with Arm Ethos-N support # Possible values: # - OFF: disable Arm Ethos-N support # - path/to/arm-ethos-N-stack: use a specific version of the # Ethos-N driver stack set(USE_ETHOSN OFF) # If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine # otherwise use ETHOSN_HW (OFF) to use the software test infrastructure set(USE_ETHOSN_HW OFF) # Whether to build with Arm(R) Ethos(TM)-U NPU codegen support set(USE_ETHOSU OFF) # Whether to build with TensorRT codegen or runtime # Examples are available here: docs/deploy/tensorrt.rst. 
# # USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are # offloaded to TensorRT. OFF/ON # USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of # TensorRT library. OFF/ON/"path/to/TensorRT" set(USE_TENSORRT_CODEGEN OFF) set(USE_TENSORRT_RUNTIME OFF) # Whether use VITIS-AI codegen set(USE_VITIS_AI OFF) # Build Verilator codegen and runtime set(USE_VERILATOR OFF) # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) # - OFF: disable ANTLR # - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file set(USE_ANTLR OFF) # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) # Whether to build fast VTA simulator driver set(USE_VTA_FSIM OFF) # Whether to build cycle-accurate VTA simulator driver set(USE_VTA_TSIM OFF) # Whether to build VTA FPGA driver (device side only) set(USE_VTA_FPGA OFF) # Whether use Thrust set(USE_THRUST OFF) # Whether to build the TensorFlow TVMDSOOp module set(USE_TF_TVMDSOOP OFF) # Whether to build the PyTorch custom class module set(USE_PT_TVMDSOOP OFF) # Whether to use STL's std::unordered_map or TVM's POD compatible Map set(USE_FALLBACK_STL_MAP OFF) # Whether to use hexagon device set(USE_HEXAGON_DEVICE OFF) set(USE_HEXAGON_SDK /path/to/sdk) # Whether to build the hexagon launcher set(USE_HEXAGON_LAUNCHER OFF) # Hexagon architecture to target when compiling TVM itself (not the target for # compiling _by_ TVM). This applies to components like the TVM runtime, but is # also used to select correct include/library paths from the Hexagon SDK when # building offloading runtime for Android. # Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66") # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) # Whether enable BNNS runtime set(USE_BNNS OFF) # Whether to use libbacktrace # Libbacktrace provides line and column information on stack traces from errors. # It is only supported on linux and macOS. # Possible values: # - AUTO: auto set according to system information and feasibility # - ON: enable libbacktrace # - OFF: disable libbacktrace set(USE_LIBBACKTRACE AUTO) # Whether to build static libtvm_runtime.a, the default is to build the dynamic # version: libtvm_runtime.so. # # The static runtime library needs to be linked into executables with the linker # option --whole-archive (or its equivalent). The reason is that the TVM registry # mechanism relies on global constructors being executed at program startup. # Global constructors alone are not sufficient for the linker to consider a # library member to be used, and some of such library members (object files) may # not be included in the final executable. This would make the corresponding # runtime functions to be unavailable to the program. set(BUILD_STATIC_RUNTIME OFF) # Caches the build so that building is faster when switching between branches. # If you switch branches, build and then encounter a linking error, you may # need to regenerate the build tree through "make .." (the cache will # still provide significant speedups). # Possible values: # - AUTO: search for path to ccache, disable if not found. # - ON: enable ccache by searching for the path to ccache, report an error if not found # - OFF: disable ccache # - /path/to/ccache: use specific path to ccache set(USE_CCACHE AUTO) # Whether to enable PAPI support in profiling. PAPI provides access to hardware # counters while profiling. # Possible values: # - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc # - OFF: disable PAPI support. # - /path/to/folder/containing/: Path to folder containing papi.pc. 
set(USE_PAPI OFF) # Whether to use GoogleTest for C++ unit tests. When enabled, the generated # build file (e.g. Makefile) will have a target "cpptest". # Possible values: # - ON: enable GoogleTest. The package `GTest` will be required for cmake # to succeed. # - OFF: disable GoogleTest. # - AUTO: cmake will attempt to find the GTest package, if found GTest will # be enabled, otherwise it will be disabled. # Note that cmake will use `find_package` to find GTest. Please use cmake's # predefined variables to specify the path to the GTest package if needed. set(USE_GTEST AUTO) # Enable using CUTLASS as a BYOC backend # Need to have USE_CUDA=ON set(USE_CUTLASS OFF) ================================================ FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/x86/config.cmake ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #-------------------------------------------------------------------- # Template custom cmake configuration for compiling # # This file is used to override the build options in build. # If you want to change the configuration, please use the following # steps. Assume you are on the root directory. 
First copy this # file so that any local changes will be ignored by git # # $ mkdir build # $ cp cmake/config.cmake build # # Next modify the according entries, and then compile by # # $ cd build # $ cmake .. # # Then build in parallel with 8 threads # # $ make -j8 #-------------------------------------------------------------------- #--------------------------------------------- # Backend runtimes. #--------------------------------------------- # Whether enable CUDA during compile, # # Possible values: # - ON: enable CUDA with cmake's auto search # - OFF: disable CUDA # - /path/to/cuda: use specific path to cuda toolkit set(USE_CUDA OFF) # Whether enable ROCM runtime # # Possible values: # - ON: enable ROCM with cmake's auto search # - OFF: disable ROCM # - /path/to/rocm: use specific path to rocm set(USE_ROCM OFF) # Whether enable SDAccel runtime set(USE_SDACCEL OFF) # Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime set(USE_AOCL OFF) # Whether enable OpenCL runtime # # Possible values: # - ON: enable OpenCL with cmake's auto search # - OFF: disable OpenCL # - /path/to/opencl-sdk: use specific path to opencl-sdk set(USE_OPENCL OFF) # Whether enable Metal runtime set(USE_METAL OFF) # Whether enable Vulkan runtime # # Possible values: # - ON: enable Vulkan with cmake's auto search # - OFF: disable vulkan # - /path/to/vulkan-sdk: use specific path to vulkan-sdk set(USE_VULKAN OFF) # Whether enable OpenGL runtime set(USE_OPENGL OFF) # Whether enable MicroTVM runtime set(USE_MICRO OFF) # Whether enable RPC runtime set(USE_RPC ON) # Whether to build the C++ RPC server binary set(USE_CPP_RPC OFF) # Whether to build the iOS RPC server application set(USE_IOS_RPC OFF) # Whether embed stackvm into the runtime set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph executor. set(USE_GRAPH_EXECUTOR ON) # Whether enable tiny graph executor with CUDA Graph set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) # Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF) # Whether to enable the profiler for the graph executor and vm set(USE_PROFILER ON) # Whether enable microTVM standalone runtime set(USE_MICRO_STANDALONE_RUNTIME OFF) # Whether build with LLVM support # Requires LLVM version >= 4.0 # # Possible values: # - ON: enable llvm with cmake's find search # - OFF: disable llvm, note this will disable CPU codegen # which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. set(USE_LLVM ON) #--------------------------------------------- # Contrib libraries #--------------------------------------------- # Whether to build with BYODT software emulated posit custom datatype # # Possible values: # - ON: enable BYODT posit, requires setting UNIVERSAL_PATH # - OFF: disable BYODT posit # # set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON set(USE_BYODT_POSIT OFF) # Whether use BLAS, choices: openblas, atlas, apple set(USE_BLAS none) # Whether to use MKL # Possible values: # - ON: Enable MKL # - /path/to/mkl: mkl root path # - OFF: Disable MKL # set(USE_MKL /opt/intel/mkl) for UNIX # set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32 # set(USE_MKL ) if using `pip install mkl` set(USE_MKL OFF) # Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library set(USE_MKLDNN OFF) # Whether use OpenMP thread pool, choices: gnu, intel # Note: "gnu" uses gomp library, "intel" uses iomp5 library set(USE_OPENMP none) # Whether use contrib.random in runtime set(USE_RANDOM ON) # Whether use NNPack set(USE_NNPACK OFF) # Possible values: # - ON: enable tflite with cmake's find search # - OFF: disable tflite # - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library set(USE_TFLITE OFF) # /path/to/tensorflow: tensorflow root path when use tflite library set(USE_TENSORFLOW_PATH none) # Required for full builds with TFLite. Not needed for runtime with TFLite. 
# /path/to/flatbuffers: flatbuffers root path when using tflite library set(USE_FLATBUFFERS_PATH none) # Possible values: # - OFF: disable tflite support for edgetpu # - /path/to/edgetpu: use specific path to edgetpu library set(USE_EDGETPU OFF) # Possible values: # - ON: enable cuDNN with cmake's auto search in CUDA directory # - OFF: disable cuDNN # - /path/to/cudnn: use specific path to cuDNN path set(USE_CUDNN OFF) # Whether use cuBLAS set(USE_CUBLAS OFF) # Whether use MIOpen set(USE_MIOPEN OFF) # Whether use MPS set(USE_MPS OFF) # Whether use rocBlas set(USE_ROCBLAS OFF) # Whether use contrib sort set(USE_SORT ON) # Whether use MKL-DNN (DNNL) codegen set(USE_DNNL_CODEGEN OFF) # Whether to use Arm Compute Library (ACL) codegen # We provide 2 separate flags since we cannot build the ACL runtime on x86. # This is useful for cases where you want to cross-compile a relay graph # on x86 then run on AArch. # # An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst. # # USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported # operators to Arm Compute Library. OFF/ON # USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL # runtime. OFF/ON/"path/to/ACL" set(USE_ARM_COMPUTE_LIB OFF) set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF) # Whether to build with Arm Ethos-N support # Possible values: # - OFF: disable Arm Ethos-N support # - path/to/arm-ethos-N-stack: use a specific version of the # Ethos-N driver stack set(USE_ETHOSN OFF) # If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine # otherwise use ETHOSN_HW (OFF) to use the software test infrastructure set(USE_ETHOSN_HW OFF) # Whether to build with Arm(R) Ethos(TM)-U NPU codegen support set(USE_ETHOSU OFF) # Whether to build with TensorRT codegen or runtime # Examples are available here: docs/deploy/tensorrt.rst. 
# # USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are # offloaded to TensorRT. OFF/ON # USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of # TensorRT library. OFF/ON/"path/to/TensorRT" set(USE_TENSORRT_CODEGEN OFF) set(USE_TENSORRT_RUNTIME OFF) # Whether use VITIS-AI codegen set(USE_VITIS_AI OFF) # Build Verilator codegen and runtime set(USE_VERILATOR OFF) # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) # - OFF: disable ANTLR # - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file set(USE_ANTLR OFF) # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) # Whether to build fast VTA simulator driver set(USE_VTA_FSIM OFF) # Whether to build cycle-accurate VTA simulator driver set(USE_VTA_TSIM OFF) # Whether to build VTA FPGA driver (device side only) set(USE_VTA_FPGA OFF) # Whether use Thrust set(USE_THRUST OFF) # Whether to build the TensorFlow TVMDSOOp module set(USE_TF_TVMDSOOP OFF) # Whether to build the PyTorch custom class module set(USE_PT_TVMDSOOP OFF) # Whether to use STL's std::unordered_map or TVM's POD compatible Map set(USE_FALLBACK_STL_MAP OFF) # Whether to use hexagon device set(USE_HEXAGON_DEVICE OFF) set(USE_HEXAGON_SDK /path/to/sdk) # Whether to build the hexagon launcher set(USE_HEXAGON_LAUNCHER OFF) # Hexagon architecture to target when compiling TVM itself (not the target for # compiling _by_ TVM). This applies to components like the TVM runtime, but is # also used to select correct include/library paths from the Hexagon SDK when # building offloading runtime for Android. # Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66") # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) # Whether enable BNNS runtime set(USE_BNNS OFF) # Whether to use libbacktrace # Libbacktrace provides line and column information on stack traces from errors. # It is only supported on linux and macOS. # Possible values: # - AUTO: auto set according to system information and feasibility # - ON: enable libbacktrace # - OFF: disable libbacktrace set(USE_LIBBACKTRACE AUTO) # Whether to build static libtvm_runtime.a, the default is to build the dynamic # version: libtvm_runtime.so. # # The static runtime library needs to be linked into executables with the linker # option --whole-archive (or its equivalent). The reason is that the TVM registry # mechanism relies on global constructors being executed at program startup. # Global constructors alone are not sufficient for the linker to consider a # library member to be used, and some of such library members (object files) may # not be included in the final executable. This would make the corresponding # runtime functions to be unavailable to the program. set(BUILD_STATIC_RUNTIME OFF) # Caches the build so that building is faster when switching between branches. # If you switch branches, build and then encounter a linking error, you may # need to regenerate the build tree through "make .." (the cache will # still provide significant speedups). # Possible values: # - AUTO: search for path to ccache, disable if not found. # - ON: enable ccache by searching for the path to ccache, report an error if not found # - OFF: disable ccache # - /path/to/ccache: use specific path to ccache set(USE_CCACHE AUTO) # Whether to enable PAPI support in profiling. PAPI provides access to hardware # counters while profiling. # Possible values: # - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc # - OFF: disable PAPI support. # - /path/to/folder/containing/: Path to folder containing papi.pc. 
set(USE_PAPI OFF) # Whether to use GoogleTest for C++ unit tests. When enabled, the generated # build file (e.g. Makefile) will have a target "cpptest". # Possible values: # - ON: enable GoogleTest. The package `GTest` will be required for cmake # to succeed. # - OFF: disable GoogleTest. # - AUTO: cmake will attempt to find the GTest package, if found GTest will # be enabled, otherwise it will be disabled. # Note that cmake will use `find_package` to find GTest. Please use cmake's # predefined variables to specify the path to the GTest package if needed. set(USE_GTEST AUTO) # Enable using CUTLASS as a BYOC backend # Need to have USE_CUDA=ON set(USE_CUTLASS OFF) ================================================ FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/x86_cuda/config.cmake ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #-------------------------------------------------------------------- # Template custom cmake configuration for compiling # # This file is used to override the build options in build. # If you want to change the configuration, please use the following # steps. Assume you are on the root directory. 
First copy this # file so that any local changes will be ignored by git # # $ mkdir build # $ cp cmake/config.cmake build # # Next modify the according entries, and then compile by # # $ cd build # $ cmake .. # # Then build in parallel with 8 threads # # $ make -j8 #-------------------------------------------------------------------- #--------------------------------------------- # Backend runtimes. #--------------------------------------------- # Whether enable CUDA during compile, # # Possible values: # - ON: enable CUDA with cmake's auto search # - OFF: disable CUDA # - /path/to/cuda: use specific path to cuda toolkit set(USE_CUDA ON) # Whether enable ROCM runtime # # Possible values: # - ON: enable ROCM with cmake's auto search # - OFF: disable ROCM # - /path/to/rocm: use specific path to rocm set(USE_ROCM OFF) # Whether enable SDAccel runtime set(USE_SDACCEL OFF) # Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime set(USE_AOCL OFF) # Whether enable OpenCL runtime # # Possible values: # - ON: enable OpenCL with cmake's auto search # - OFF: disable OpenCL # - /path/to/opencl-sdk: use specific path to opencl-sdk set(USE_OPENCL OFF) # Whether enable Metal runtime set(USE_METAL OFF) # Whether enable Vulkan runtime # # Possible values: # - ON: enable Vulkan with cmake's auto search # - OFF: disable vulkan # - /path/to/vulkan-sdk: use specific path to vulkan-sdk set(USE_VULKAN OFF) # Whether enable OpenGL runtime set(USE_OPENGL OFF) # Whether enable MicroTVM runtime set(USE_MICRO OFF) # Whether enable RPC runtime set(USE_RPC ON) # Whether to build the C++ RPC server binary set(USE_CPP_RPC OFF) # Whether to build the iOS RPC server application set(USE_IOS_RPC OFF) # Whether embed stackvm into the runtime set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph executor. set(USE_GRAPH_EXECUTOR ON) # Whether enable tiny graph executor with CUDA Graph set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) # Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF) # Whether to enable the profiler for the graph executor and vm set(USE_PROFILER ON) # Whether enable microTVM standalone runtime set(USE_MICRO_STANDALONE_RUNTIME OFF) # Whether build with LLVM support # Requires LLVM version >= 4.0 # # Possible values: # - ON: enable llvm with cmake's find search # - OFF: disable llvm, note this will disable CPU codegen # which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. set(USE_LLVM ON) #--------------------------------------------- # Contrib libraries #--------------------------------------------- # Whether to build with BYODT software emulated posit custom datatype # # Possible values: # - ON: enable BYODT posit, requires setting UNIVERSAL_PATH # - OFF: disable BYODT posit # # set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON set(USE_BYODT_POSIT OFF) # Whether use BLAS, choices: openblas, atlas, apple set(USE_BLAS none) # Whether to use MKL # Possible values: # - ON: Enable MKL # - /path/to/mkl: mkl root path # - OFF: Disable MKL # set(USE_MKL /opt/intel/mkl) for UNIX # set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32 # set(USE_MKL ) if using `pip install mkl` set(USE_MKL OFF) # Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library set(USE_MKLDNN OFF) # Whether use OpenMP thread pool, choices: gnu, intel # Note: "gnu" uses gomp library, "intel" uses iomp5 library set(USE_OPENMP none) # Whether use contrib.random in runtime set(USE_RANDOM ON) # Whether use NNPack set(USE_NNPACK OFF) # Possible values: # - ON: enable tflite with cmake's find search # - OFF: disable tflite # - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library set(USE_TFLITE OFF) # /path/to/tensorflow: tensorflow root path when use tflite library set(USE_TENSORFLOW_PATH none) # Required for full builds with TFLite. Not needed for runtime with TFLite. 
# /path/to/flatbuffers: flatbuffers root path when using tflite library set(USE_FLATBUFFERS_PATH none) # Possible values: # - OFF: disable tflite support for edgetpu # - /path/to/edgetpu: use specific path to edgetpu library set(USE_EDGETPU OFF) # Possible values: # - ON: enable cuDNN with cmake's auto search in CUDA directory # - OFF: disable cuDNN # - /path/to/cudnn: use specific path to cuDNN path set(USE_CUDNN OFF) # Whether use cuBLAS set(USE_CUBLAS OFF) # Whether use MIOpen set(USE_MIOPEN OFF) # Whether use MPS set(USE_MPS OFF) # Whether use rocBlas set(USE_ROCBLAS OFF) # Whether use contrib sort set(USE_SORT ON) # Whether use MKL-DNN (DNNL) codegen set(USE_DNNL_CODEGEN OFF) # Whether to use Arm Compute Library (ACL) codegen # We provide 2 separate flags since we cannot build the ACL runtime on x86. # This is useful for cases where you want to cross-compile a relay graph # on x86 then run on AArch. # # An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst. # # USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported # operators to Arm Compute Library. OFF/ON # USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL # runtime. OFF/ON/"path/to/ACL" set(USE_ARM_COMPUTE_LIB OFF) set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF) # Whether to build with Arm Ethos-N support # Possible values: # - OFF: disable Arm Ethos-N support # - path/to/arm-ethos-N-stack: use a specific version of the # Ethos-N driver stack set(USE_ETHOSN OFF) # If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine # otherwise use ETHOSN_HW (OFF) to use the software test infrastructure set(USE_ETHOSN_HW OFF) # Whether to build with Arm(R) Ethos(TM)-U NPU codegen support set(USE_ETHOSU OFF) # Whether to build with TensorRT codegen or runtime # Examples are available here: docs/deploy/tensorrt.rst. 
# # USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are # offloaded to TensorRT. OFF/ON # USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of # TensorRT library. OFF/ON/"path/to/TensorRT" set(USE_TENSORRT_CODEGEN OFF) set(USE_TENSORRT_RUNTIME OFF) # Whether use VITIS-AI codegen set(USE_VITIS_AI OFF) # Build Verilator codegen and runtime set(USE_VERILATOR OFF) # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) # - OFF: disable ANTLR # - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file set(USE_ANTLR OFF) # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) # Whether to build fast VTA simulator driver set(USE_VTA_FSIM OFF) # Whether to build cycle-accurate VTA simulator driver set(USE_VTA_TSIM OFF) # Whether to build VTA FPGA driver (device side only) set(USE_VTA_FPGA OFF) # Whether use Thrust set(USE_THRUST OFF) # Whether to build the TensorFlow TVMDSOOp module set(USE_TF_TVMDSOOP OFF) # Whether to build the PyTorch custom class module set(USE_PT_TVMDSOOP OFF) # Whether to use STL's std::unordered_map or TVM's POD compatible Map set(USE_FALLBACK_STL_MAP OFF) # Whether to use hexagon device set(USE_HEXAGON_DEVICE OFF) set(USE_HEXAGON_SDK /path/to/sdk) # Whether to build the hexagon launcher set(USE_HEXAGON_LAUNCHER OFF) # Hexagon architecture to target when compiling TVM itself (not the target for # compiling _by_ TVM). This applies to components like the TVM runtime, but is # also used to select correct include/library paths from the Hexagon SDK when # building offloading runtime for Android. # Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66") # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) # Whether enable BNNS runtime set(USE_BNNS OFF) # Whether to use libbacktrace # Libbacktrace provides line and column information on stack traces from errors. # It is only supported on linux and macOS. # Possible values: # - AUTO: auto set according to system information and feasibility # - ON: enable libbacktrace # - OFF: disable libbacktrace set(USE_LIBBACKTRACE AUTO) # Whether to build static libtvm_runtime.a, the default is to build the dynamic # version: libtvm_runtime.so. # # The static runtime library needs to be linked into executables with the linker # option --whole-archive (or its equivalent). The reason is that the TVM registry # mechanism relies on global constructors being executed at program startup. # Global constructors alone are not sufficient for the linker to consider a # library member to be used, and some of such library members (object files) may # not be included in the final executable. This would make the corresponding # runtime functions to be unavailable to the program. set(BUILD_STATIC_RUNTIME OFF) # Caches the build so that building is faster when switching between branches. # If you switch branches, build and then encounter a linking error, you may # need to regenerate the build tree through "make .." (the cache will # still provide significant speedups). # Possible values: # - AUTO: search for path to ccache, disable if not found. # - ON: enable ccache by searching for the path to ccache, report an error if not found # - OFF: disable ccache # - /path/to/ccache: use specific path to ccache set(USE_CCACHE AUTO) # Whether to enable PAPI support in profiling. PAPI provides access to hardware # counters while profiling. # Possible values: # - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc # - OFF: disable PAPI support. # - /path/to/folder/containing/: Path to folder containing papi.pc. 
class Operation(abc.ABC):
    """Abstract base class of every nebullvm operation.

    An operation owns a mutable ``state`` dictionary, the device it targets
    (CPU until ``to`` is called), an execution counter, the shared loguru
    logger and an optional feedback collector.
    """

    def __init__(self):
        self._state = {}
        self.device = Device(DeviceType.CPU)
        self.execute_count = 0
        self.logger = logger
        self.feedback_collector = None

    def set_feedback_collector(self, feedback_collector: FeedbackCollector):
        """Attach a feedback collector to this operation and its children.

        Every instance attribute that is itself an ``Operation`` receives the
        same collector through a recursive call.

        Args:
            feedback_collector (FeedbackCollector): Collector to attach.
        """
        self.feedback_collector = feedback_collector
        nested_operations = (
            attribute
            for attribute in vars(self).values()
            if isinstance(attribute, Operation)
        )
        for nested_operation in nested_operations:
            nested_operation.set_feedback_collector(feedback_collector)

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the operation. Concrete subclasses must override this."""
        raise NotImplementedError()

    @property
    def state(self) -> Dict[str, any]:
        """Dictionary holding the internal state of the operation."""
        return self._state

    def to(self, device: Union[str, Device]):
        """Select the device used by the operation.

        Args:
            device (Union[str, Device]): Either a ``Device`` instance, stored
                as is, or a string that is resolved with ``check_device``.

        Returns:
            Operation: The operation itself, so that calls can be chained.
        """
        if isinstance(device, str):
            device = check_device(device)
        self.device = device
        return self
class Converter(Operation, abc.ABC):
    """Base class of the operations converting a model to other formats.

    The model and the data are provided through ``set_state``; the converted
    models are collected in ``converted_models`` and exposed, without the
    ``None`` entries, by ``get_result``.
    """

    ONNX_EXTENSION = ".onnx"
    TORCH_EXTENSION = ".pt"
    TF_EXTENSION = ".pb"
    SUPPORTED_DEVICES = [DeviceType.GPU, DeviceType.CPU]

    def __init__(self, model_name: Optional[str] = None):
        """Create an empty converter.

        Args:
            model_name (str, optional): Base name given to the converted
                model. "temp" is used when no (or an empty) name is passed.
        """
        super().__init__()
        # Nothing is known yet: model and data arrive through set_state().
        self.model = self.data = None
        self.converted_models = self.model_params = None
        # Resets the device assigned by Operation.__init__.
        self.device = None
        self.model_name = model_name if model_name else "temp"

    def set_state(
        self, model: Union[torch.nn.Module, tf.Module, str], data: DataManager
    ):
        """Store the model to convert and the data associated with it.

        Returns:
            Converter: The converter itself, so that calls can be chained.
        """
        self.model, self.data = model, data
        return self

    def get_result(self) -> List:
        """Return the converted models, skipping the ``None`` entries."""
        return list(
            filter(
                lambda converted: converted is not None, self.converted_models
            )
        )
class ONNXConverter(Converter):
    """Converter used when the input model is already an ONNX file.

    No real conversion happens: the file referenced by ``self.model`` is
    loaded and saved again under ``save_path``.
    """

    DEST_FRAMEWORKS = []

    def execute(self, save_path, model_params):
        """Copy the input ONNX model into ``save_path``.

        On success ``converted_models`` holds the path of the saved copy (as
        a string). If the model cannot be loaded or saved, an error is logged
        and ``converted_models`` is left empty.

        Args:
            save_path (Path or str): Directory where the model is saved.
            model_params: Unused; kept for interface compatibility with the
                other converters.
        """
        onnx_path = Path(save_path) / f"{self.model_name}{self.ONNX_EXTENSION}"
        try:
            model_onnx = onnx.load(str(self.model))
            onnx.save(model_onnx, str(onnx_path))
        except Exception:
            self.logger.error(
                "The provided onnx model path is invalid. Please provide"
                " a valid path to a model in order to use Nebullvm."
            )
            self.converted_models = []
            # Stop here: without this return the empty result would be
            # overwritten below with a path to a file that was never written.
            return

        self.converted_models = [str(onnx_path)]

    def tensorflow_conversion(self):
        # TODO: Implement conversion from ONNX to Tensorflow
        raise NotImplementedError()

    def pytorch_conversion(self):
        # TODO: Implement conversion from ONNX to Pytorch
        raise NotImplementedError()
len(self._input_texts) // self._bs class _HFDictDataset(Sequence): def __init__( self, input_data: List, ys: Optional[List], keywords: List[str], ): self._input_data = input_data self._ys = ys self._keys = keywords def __getitem__(self, item: int): pointer = item if pointer >= len(self._input_data): raise IndexError mini_batch = self._input_data[pointer] if self._ys is not None: mini_batch_y = self._ys[pointer] else: mini_batch_y = None return ( tuple(self._concatenate(mini_batch, key) for key in self._keys), mini_batch_y, ) def __len__(self): return len(self._input_data) @staticmethod def _concatenate(mini_batch, key): if isinstance(mini_batch[key], torch.Tensor): return torch.concat([mini_batch[key]]) elif isinstance(mini_batch[key], tf.Tensor): return tf.concat([mini_batch[key]], 0) else: return np.concatenate([mini_batch[key]]) def convert_hf_model( model: PreTrainedModel, input_data: List, device: Device, tokenizer: Optional[PreTrainedTokenizer] = None, tokenizer_args: Optional[Dict] = None, batch_size: int = 1, **kwargs, ): if is_dict_type(input_data[0]): # already tokenized data if "labels" in input_data[0]: labels = [data.pop("labels") for data in input_data] else: labels = None input_example = input_data[0] output_structure, output_type = get_output_structure_from_dict( input_example=input_example, model=model, device=device, ) input_data = _HFDictDataset( input_data=input_data, ys=labels, keywords=list(input_example.keys()), ) else: assert tokenizer is not None, ( "Tokenizer is needed when passing data in string format. Please " "provide the tokenizer as keyword argument." 
) if tokenizer_args is None: tokenizer_args = {} if not isinstance(input_data[0], str): ys = [data[1] for data in input_data] input_data = [data[0] for data in input_data] else: ys = None output_structure, output_type = get_output_structure_from_text( text=input_data[0], model=model, tokenizer=tokenizer, tokenizer_args=tokenizer_args, device=device, ) input_example = tokenizer(input_data, **tokenizer_args) input_data = _HFTextDataset( input_texts=input_data, ys=ys, keywords=list(input_example.keys()), batch_size=batch_size, tokenizer=tokenizer, tokenizer_args=tokenizer_args, ) if isinstance(model, torch.nn.Module): wrapper_model = PyTorchTransformerWrapper( core_model=model, encoded_input=input_example ) else: wrapper_model = TensorFlowTransformerWrapper( core_model=model, encoded_input=input_example ) return ( wrapper_model, input_data, list(wrapper_model.inputs_types.keys()), output_structure, output_type, ) ================================================ FILE: optimization/nebullvm/nebullvm/operations/conversions/pytorch.py ================================================ from contextlib import nullcontext from pathlib import Path from loguru import logger from nebullvm.config import ONNX_OPSET_VERSION from nebullvm.core.models import ModelParams, Device, DeviceType, DataType from nebullvm.optional_modules.torch import torch, Module from nebullvm.tools.data import DataManager from nebullvm.tools.pytorch import ( create_model_inputs_torch, ) @torch.inference_mode() def convert_torch_to_onnx( torch_model: Module, input_data: DataManager, model_params: ModelParams, output_file_path: Path, device: Device, ): """Function importing a custom model in pytorch and converting it in ONNX Args: torch_model (Module): Pytorch model. input_data (DataManager): Custom data provided by user to be used as input for the converter. model_params (ModelParams): Model Parameters as input sizes and dynamic axis information. 
output_file_path (str or Path): Path where storing the output ONNX file. device (Device): Device where the model will be run. """ if input_data is not None: input_tensors = list(input_data.get_list(1)[0]) else: input_tensors = create_model_inputs_torch(model_params.input_infos) output_sizes = model_params.output_sizes output_types = model_params.output_types input_names = [f"input_{i}" for i in range(len(input_tensors))] output_names = [f"output_{i}" for i in range(len(output_sizes))] dynamic_info = model_params.dynamic_info if dynamic_info is not None: # This check is needed to enable backward compatibility with # previous versions of nebullvm if isinstance(list(dynamic_info.inputs[0].values())[0], str): onnx_format_inputs = dynamic_info.inputs else: onnx_format_inputs = [ {k: v["name"] for (k, v) in d.items()} for d in dynamic_info.inputs ] assert len(dynamic_info.outputs) == len(output_names), ( f"The number of dynamic outputs provided in the dynamic info " f"dict ({len(dynamic_info.outputs)}) is not equal to the number " f"of outputs of the model ({len(output_names)}), Detected model " f"output shapes are: {output_sizes} " ) dynamic_info = { name: dynamic_dict for name, dynamic_dict in zip( input_names + output_names, onnx_format_inputs + dynamic_info.outputs, ) } try: # try conversion with model on cpu if device.type is DeviceType.GPU: input_tensors = [x.cpu() for x in input_tensors] torch_model.cpu() torch.onnx.export( torch_model, # model being run tuple( input_tensors ), # model input (or a tuple for multiple inputs) str(output_file_path), # where to save the model (can be a file or file-like object) export_params=True, # store the trained parameter weights inside the model file opset_version=ONNX_OPSET_VERSION, # the ONNX version to export the model to do_constant_folding=True, # whether to execute constant folding for optimization input_names=input_names, # the model's input names output_names=output_names, # the model's output names 
dynamic_axes=dynamic_info, ) # Put again model on gpu if device.type is DeviceType.GPU: torch_model.to(device.to_torch_format()) return output_file_path except Exception: # try conversion with model on gpu if device.type is DeviceType.GPU: input_tensors = [ x.to(device.to_torch_format()) for x in input_tensors ] torch_model.to(device.to_torch_format()) try: with torch.autocast("cuda") if output_types[ 0 ] is DataType.FLOAT16 else nullcontext(): torch.onnx.export( torch_model, # model being run tuple( input_tensors ), # model input (or a tuple for multiple inputs) str(output_file_path), # where to save the model # (can be a file or file-like object) export_params=True, # store the trained parameter weights inside the model opset_version=ONNX_OPSET_VERSION, # the ONNX version to export the model to do_constant_folding=True, # whether to execute constant folding for optimization input_names=input_names, # the model's input names output_names=output_names, # the model's output names dynamic_axes=dynamic_info, ) return output_file_path except Exception: logger.warning( "Exception raised during conversion from torch" " to onnx model. ONNX pipeline will be unavailable." ) return None else: logger.warning( "Exception raised during conversion from torch" " to onnx model. ONNX pipeline will be unavailable." 
def convert_tf_saved_model_to_onnx(
    model: tf.Module, output_file_path: Union[str, Path]
):
    """Convert TF models into ONNX.

    The model is exported as a SavedModel into a temporary directory and then
    converted by running ``tf2onnx.convert`` in a subprocess. The produced
    file is finally loaded with ``onnx.load`` as a sanity check.

    Args:
        model (tf.Module): TF model.
        output_file_path (Path): Path where storing the output file.

    Returns:
        Union[str, Path]: ``output_file_path``, unchanged.

    Raises:
        subprocess.CalledProcessError: If the tf2onnx command exits with a
            non-zero status.
    """
    import sys  # local import: only needed to locate the interpreter

    with TemporaryDirectory() as temp_dir:
        tf.saved_model.save(model, export_dir=temp_dir)

        # Prefer the interpreter running this code: it is the one where
        # tf2onnx has been imported. Probe for "python3"/"python" only if
        # the interpreter path is unknown.
        python_cmd = sys.executable
        if not python_cmd:
            try:
                subprocess.check_output(["python3", "--version"])
                python_cmd = "python3"
            except (subprocess.CalledProcessError, OSError):
                # OSError covers FileNotFoundError, raised when no "python3"
                # executable exists at all.
                python_cmd = "python"

        onnx_cmd = [
            python_cmd,
            "-m",
            "tf2onnx.convert",
            "--saved-model",
            f"{temp_dir}",
            "--output",
            f"{output_file_path}",
            "--opset",
            f"{ONNX_OPSET_VERSION}",
        ]
        # check=True: fail loudly instead of going on after a failed
        # conversion (and possibly validating a stale output file).
        subprocess.run(onnx_cmd, check=True)
        onnx.load(output_file_path)

    return output_file_path
def get_conversion_op(framework: DeepLearningFramework) -> Converter:
    """Build the conversion operation matching the model framework.

    Args:
        framework (DeepLearningFramework): Framework of the input model.

    Returns:
        Converter: A new ``PytorchConverter`` for PyTorch, a new
            ``TensorflowConverter`` for TensorFlow and a new
            ``ONNXConverter`` for any other value.
    """
    if framework == DeepLearningFramework.PYTORCH:
        return PytorchConverter()
    if framework == DeepLearningFramework.TENSORFLOW:
        return TensorflowConverter()
    # Every remaining framework is handled by the ONNX converter.
    return ONNXConverter()
@dataclass
class BaseInferenceLearner(ABC):
    """Base class for Inference Learners.

    Attributes:
        network_parameters (ModelParams): Parameters of the model.
        input_tfms (MultiStageTransformation, optional): Transformation
            applied to every positional input before ``run`` is invoked.
            An empty transformation is replaced by None.
        input_data (List, optional): Init-only value, kept in
            ``_input_data``.
        device (Device, optional): Device associated with the learner.
        quantization_type (QuantizationType, optional): Quantization
            associated with the learner.
    """

    network_parameters: ModelParams
    input_tfms: Optional[MultiStageTransformation] = None
    input_data: InitVar[List[Any]] = None
    device: Device = None
    quantization_type: QuantizationType = None

    @property
    @abstractmethod
    def name(self) -> str:
        """The name of the InferenceLearner"""

    def __post_init__(self, input_data):
        # A transformation without stages is dropped. The original test
        # (`len(...) < 0`) could never be true since a length is never
        # negative.
        if self.input_tfms is not None and len(self.input_tfms) == 0:
            self.input_tfms = None
        self._tmp_folder = Path(mkdtemp())
        self._input_data = input_data

    def _store_file(self, file_path: Union[str, Path]):
        """Copy a file into the learner temporary folder."""
        return shutil.copy(str(file_path), str(self._tmp_folder))

    def _store_dir(self, dir_path: Union[str, Path]):
        """Copy the content of a directory into the temporary folder."""
        try:
            # For python >= 3.8
            return shutil.copytree(
                str(dir_path), str(self._tmp_folder), dirs_exist_ok=True
            )
        except TypeError:
            # For python <=3.7: `dirs_exist_ok` is not accepted, so the
            # destination is removed before copying.
            if os.path.isdir(self._tmp_folder):
                shutil.rmtree(str(self._tmp_folder))
            return shutil.copytree(str(dir_path), str(self._tmp_folder))

    def __del__(self, shutil=shutil):
        # `shutil` is bound as a default argument, presumably to keep the
        # module reachable while the interpreter is shutting down.
        try:
            shutil.rmtree(self._tmp_folder, ignore_errors=True)
        except Exception:
            # Best effort cleanup: never raise from a finalizer.
            pass

    def predict_from_files(
        self, input_files: List[str], output_files: List[str]
    ):
        """Get a model prediction from file.

        The input files are read, processed and a prediction is run on top of
        them. Each prediction is then saved into the corresponding output
        file.

        Args:
            input_files (List[str]): List of paths to the input file.
            output_files (List[str]): List of paths to the file storing the
                prediction.
        """
        inputs = (self._read_file(input_file) for input_file in input_files)
        preds = self(*inputs)
        for pred, output_file in zip(preds, output_files):
            self._save_file(pred, output_file)

    def predict_from_listified_tensors(self, *listified_tensors: List):
        """Predict from listified tensor.

        Method useful to be used in services receiving the input tensor from
        an HTTP call.

        Args:
            listified_tensors (List): List of list-like version of the input
                tensors. Note that each element of the external list is a
                listified input tensor.

        Returns:
            List: List of list-like predictions.
        """
        inputs = (
            self.list2tensor(listified_tensor)
            for listified_tensor in listified_tensors
        )
        # The input transformations are NOT applied here: `predict` goes
        # through `__call__`, which already applies them. Doing it here too
        # would transform every input twice.
        preds = self.predict(*inputs)
        return [self.tensor2list(pred) for pred in preds]

    def list2tensor(self, listified_tensor: List) -> Any:
        """Convert list to tensor.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            Any: Tensor for the prediction.
        """
        raise NotImplementedError()

    def tensor2list(self, tensor: Any) -> List:
        """Convert tensor to list.

        Args:
            tensor (any): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        raise NotImplementedError()

    def _read_file(self, input_file: str) -> Any:
        """Read tensor from file.

        Args:
            input_file (str): Path to the file containing the input tensor.

        Returns:
            Any: Tensor read from the file.
        """
        raise NotImplementedError()

    def _save_file(self, prediction: Any, output_file: str):
        """Save prediction in the appropriate format.

        Args:
            prediction (any): The predicted tensor.
            output_file (str): Path to the file where storing the prediction.
        """
        raise NotImplementedError

    def predict(self, *args, **kwargs) -> Any:
        """Take as input a tensor and returns a prediction"""
        out = self(*args, **kwargs)
        # TensorFlow predict method must return a np array
        if isinstance(out[0], tf.Tensor):
            out = tuple(t.numpy() for t in out)
        return out

    @abstractmethod
    def run(self, *args, **kwargs) -> Any:
        """Abstract method implementing the prediction code."""
        raise NotImplementedError()

    def forward(self, *args, **kwargs):
        """Alternative method to the predict one."""
        return self(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        """Apply the input transformations (if any) to the positional
        arguments and forward everything to ``run``."""
        if self.input_tfms is not None:
            args = (self.input_tfms(_input) for _input in args)
        return self.run(*args, **kwargs)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path): Path to the directory where saving the model.
        """
        raise NotImplementedError()

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path): Path to the directory where the model is stored.

        Returns:
            BaseInferenceLearner: Loaded model.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_size(self):
        """The function returns the size of the optimized model."""
        raise NotImplementedError()

    @abstractmethod
    def free_gpu_memory(self):
        """The function cleans the gpu occupied by the inference learner."""
        raise NotImplementedError

    @abstractmethod
    def get_inputs_example(self):
        """The function returns an example of the input for the optimized
        model predict method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def output_format(self):
        """Extension of the files produced by ``_save_file``."""
        return ".txt"

    @property
    @abstractmethod
    def input_format(self):
        """Extension of the files accepted by ``_read_file``."""
        return ".txt"
module_name (str): Path to the python module where the model class is defined. network_parameters (Dict): Dictionaty containing the network parameters, i.e. batch_size, input_size and output_size. kwargs: External attributes that will be stored in the Metadata file. """ NAME: str = LEARNER_METADATA_FILENAME class_name: str module_name: str device: str quantization_type: str def __init__( self, class_name: str, module_name: str, network_parameters: Union[ModelParams, Dict], input_tfms: Union[MultiStageTransformation, Dict] = None, **kwargs, ): self.class_name = class_name self.module_name = module_name self.network_parameters = ( network_parameters.dict() if isinstance(network_parameters, ModelParams) else network_parameters ) self.input_tfms = ( input_tfms.to_dict() if isinstance(input_tfms, MultiStageTransformation) else input_tfms ) self.__dict__.update(**kwargs) def __getitem__(self, item): if not isinstance(item, str): raise TypeError( f"Error in key type. Expected str got {type(item)}" ) elif item.startswith("_"): raise ValueError("Trying to access a private attribute.") return self.__dict__.get(item) @classmethod def from_model(cls, model: BaseInferenceLearner, **kwargs): """Create the metadata from the Inference Learner. Args: model (BaseInferenceLearner): Model from which extract the metadata. kwargs: External attributes that will be stored in the Metadata file. Returns: LearnerMetadata: Metadata associated with the model. """ return cls( class_name=model.__class__.__name__, module_name=model.__module__, network_parameters=model.network_parameters, input_tfms=model.input_tfms, device=model.device.type.value if model.device is not None else None, quantization_type=model.quantization_type.value if model.quantization_type is not None else None, **kwargs, ) @classmethod def from_dict(cls, dictionary: Dict): """Create the metadata file from a dictionary. This method is the reverse one of `to_dict`. Args: dictionary (Dict): Dictionary containing the metadata. 
Returns: LearnerMetadata: Metadata associated with the model. """ if any( key not in dictionary for key in ("class_name", "module_name", "network_parameters") ): raise ValueError( "The input dictionary should contain both the model class " "name and module." ) return cls(**dictionary) def to_dict(self) -> Dict: """Method for converting the LearnerMetadata in a python dictionary. Returns: Dict: Dictionary containing the metadata. """ return { key: value for key, value in self.__dict__.items() if ( len(key) > 0 and key[0].islower() and not key.startswith("_") and value is not None ) } @classmethod def read(cls, path: Union[Path, str]): """Read the metadata file and store it into a LearnerMetadata object. Args: path (Path): Path to the directory containing the metadata file. Returns: LearnerMetadata: Metadata associated with the model. """ path = Path(path) with open(path / cls.NAME, "r") as fin: metadata_dict = json.load(fin) return cls(**metadata_dict) def save(self, path: Union[Path, str]): """Save the metadata of the model in a file. Args: path (Path): Path to the directory where saving the model metadata. """ path = Path(path) path.mkdir(exist_ok=True) metadata_dict = self.to_dict() with open(path / self.NAME, "w") as fout: json.dump(metadata_dict, fout) def load_model( self, path: Union[Path, str], **kwargs ) -> BaseInferenceLearner: """Method for loading the InferenceLearner from its metadata. The ModelMetadata file contains all the information necessary for loading the Learner, as it contains both the module where the model is defined and the class name of the model object. This method calls the appropriate class method of the Model object, thus the actual model loading is delegate to its methods. Args: path (Path): Path to the directory containing the files where the model optimization is saved. kwargs: Dictionary containing the arguments for the model's load function. 
class TensorflowBaseInferenceLearner(BaseInferenceLearner, ABC):
    """Base class of the learners working with TensorFlow tensors.

    Tensors are exchanged with files as ``.npy`` arrays.
    """

    @property
    def input_format(self):
        return ".npy"

    @property
    def output_format(self):
        return ".npy"

    def free_gpu_memory(self):
        """Clear the keras session and mark the learner as not GPU-ready."""
        tf.keras.backend.clear_session()
        self._is_gpu_ready = False

    def set_model_on_gpu(self):
        """Mark the learner as GPU-ready (only the flag is updated)."""
        self._is_gpu_ready = True

    def list2tensor(self, listified_tensor: List) -> tf.Tensor:
        """Convert list to tensor.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            tf.Tensor: Tensor ready to be used for prediction.
        """
        return tf.convert_to_tensor(listified_tensor)

    def tensor2list(self, tensor: tf.Tensor) -> List:
        """Convert tensor to list.

        Args:
            tensor (tf.Tensor): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        return tensor.numpy().tolist()

    def _read_file(self, input_file: Union[str, Path]) -> tf.Tensor:
        """Load a numpy file and return its content as a tensor."""
        numpy_array = np.load(input_file)
        input_tensor = tf.convert_to_tensor(numpy_array)
        return input_tensor

    def _save_file(self, prediction: tf.Tensor, output_file: Union[str, Path]):
        """Save the prediction as a numpy file.

        Args:
            prediction (tf.Tensor): The predicted tensor. Array-like values
                without a ``numpy`` method are accepted as well.
            output_file (str or Path): Path to the file where storing the
                prediction.
        """
        # numpy arrays have no `save` method: the array must be handed to
        # the module-level `np.save` function.
        if hasattr(prediction, "numpy"):
            array = prediction.numpy()
        else:
            array = np.asarray(prediction)
        np.save(output_file, array)

    def get_inputs_example(self, random=False):
        """Return an example of the inputs accepted by the learner.

        Args:
            random (bool): When True, or when no input data was stored at
                construction time, the example is created from the input
                infos of the network parameters.
        """
        if self._input_data is None or random:
            return tuple(
                create_model_inputs_tf(
                    input_infos=self.network_parameters.input_infos,
                )
            )
        else:
            return self._input_data
""" return tensor.tolist() def _read_file(self, input_file: Union[str, Path]) -> np.ndarray: numpy_array = np.load(input_file) return numpy_array def _save_file( self, prediction: np.ndarray, output_file: Union[str, Path] ): np.save(output_file, prediction) def get_inputs_example(self, random=False): if self._input_data is None or random: return tuple( create_model_inputs_onnx( input_infos=self.network_parameters.input_infos, ) ) else: return self._input_data class InferenceLearnerWrapper(BaseInferenceLearner, ABC): """Wrapper model around InferenceLearners. It's a base class: cannot be instantiated. For all the BaseInferenceLearner-related methods, the implementation of the core model will be used. This class just re-implement the load and save methods, allowing (and forcing) then the child class to re-implement the `predict` method. Attributes: network_parameters (ModelParams): Model parameters. core_inference_learner (BaseInferenceLearner): Inference Learner. """ CORE_MODEL_SAVE_DIR = "core_model" def __init__(self, core_inference_learner: BaseInferenceLearner): super().__init__( network_parameters=core_inference_learner.network_parameters ) self.core_inference_learner = core_inference_learner def list2tensor(self, listified_tensor: List) -> Any: return self.core_inference_learner.list2tensor(listified_tensor) def tensor2list(self, tensor: Any) -> List: return self.core_inference_learner.tensor2list(tensor) def _read_file(self, input_file: str) -> Any: return self.core_inference_learner._read_file(input_file) def _save_file(self, prediction: Any, output_file: str): self.core_inference_learner._save_file(prediction, output_file) def save(self, path: Union[str, Path], **kwargs): core_model_path = Path(path) / self.CORE_MODEL_SAVE_DIR core_model_path.mkdir(exist_ok=True, parents=True) self.core_inference_learner.save(core_model_path, **kwargs) extra_metadata_kwargs = self._get_extra_metadata_kwargs() metadata = LearnerMetadata.from_model(self, 
**extra_metadata_kwargs) metadata.save(path) self._save_wrapper_extra_info() def _get_extra_metadata_kwargs(self) -> Dict: raise NotImplementedError def _save_wrapper_extra_info(self): raise NotImplementedError @staticmethod def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict: raise NotImplementedError @staticmethod def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict: raise NotImplementedError @classmethod def load(cls, path: Union[Path, str], **kwargs): core_model_path = Path(path) / cls.CORE_MODEL_SAVE_DIR core_learner = LearnerMetadata.read(core_model_path).load_model( core_model_path, **kwargs ) metadata = LearnerMetadata.read(path) input_dict = cls._convert_metadata_to_inputs(metadata) input_dict = cls._load_wrapper_extra_info(input_dict) input_dict.update({"core_inference_learner": core_learner}) return cls(**input_dict) def free_gpu_memory(self): return self.core_inference_learner.free_gpu_memory() def get_inputs_example(self): return self.core_inference_learner.get_inputs_example() @property def output_format(self): return self.core_inference_learner.output_format @property def input_format(self): return self.core_inference_learner.input_format ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/blade_disc.py ================================================ from typing import Optional from nebullvm.core.models import ModelParams, Device from nebullvm.operations.inference_learners.torchscript import ( TorchScriptInferenceLearner, ) from nebullvm.optional_modules.torch import ScriptModule from nebullvm.tools.data import DataManager from nebullvm.tools.transformations import MultiStageTransformation class BladeDISCInferenceLearner(TorchScriptInferenceLearner): name = "BladeDISC" @classmethod def from_torch_model( cls, model: ScriptModule, network_parameters: ModelParams, device: Device, input_tfms: Optional[MultiStageTransformation] = None, input_data: DataManager = None, ): 
return cls( torch_model=model, network_parameters=network_parameters, input_tfms=input_tfms, input_data=input_data, device=device, ) ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/builders.py ================================================ from pathlib import Path from typing import Any, Union from nebullvm.core.models import ( ModelParams, DeepLearningFramework, QuantizationType, DeviceType, ) from nebullvm.operations.inference_learners.base import BuildInferenceLearner from nebullvm.operations.inference_learners.deepsparse import ( PytorchDeepSparseInferenceLearner, ) from nebullvm.operations.inference_learners.faster_transformer import ( FasterTransformerInferenceLearner, ) from nebullvm.operations.inference_learners.neural_compressor import ( PytorchNeuralCompressorInferenceLearner, ) from nebullvm.operations.inference_learners.onnx import ONNX_INFERENCE_LEARNERS from nebullvm.operations.inference_learners.openvino import ( OPENVINO_INFERENCE_LEARNERS, ) from nebullvm.operations.inference_learners.tensor_rt import ( TENSOR_RT_INFERENCE_LEARNERS, PytorchTensorRTInferenceLearner, ) from nebullvm.operations.inference_learners.tensorflow import ( TensorflowBackendInferenceLearner, TFLiteBackendInferenceLearner, ) from nebullvm.operations.inference_learners.torch_dynamo import ( TorchDynamoInferenceLearner, ) from nebullvm.operations.inference_learners.torch_neuron import ( TorchNeuronInferenceLearner, ) from nebullvm.operations.inference_learners.torch_xla import ( TorchXLAInferenceLearner, ) from nebullvm.operations.inference_learners.torchscript import ( TorchScriptInferenceLearner, ) from nebullvm.operations.inference_learners.tvm import ( APACHE_TVM_INFERENCE_LEARNERS, PytorchApacheTVMInferenceLearner, ) from nebullvm.optional_modules.tensor_rt import tensorrt as trt from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import ( ScriptModule, 
class TorchScriptBuildInferenceLearner(BuildInferenceLearner):
    """Build operation producing a ``TorchScriptInferenceLearner``."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap ``model`` in a learner and store it on the operation.

        Args:
            model (ScriptModule): Passed to the learner as ``torch_model``.
            model_params (ModelParams): Passed as ``network_parameters``.
            input_tfms (MultiStageTransformation): Passed through unchanged.
            kwargs: Accepted but not used by this builder.
        """
        learner_kwargs = {
            "torch_model": model,
            "network_parameters": model_params,
            "input_tfms": input_tfms,
            "device": self.device,
        }
        self.inference_learner = TorchScriptInferenceLearner(**learner_kwargs)
class ONNXBuildInferenceLearner(BuildInferenceLearner):
    """Build operation producing an ONNX-based inference learner."""

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        quantization_type: QuantizationType,
        **kwargs,
    ):
        """Instantiate the learner registered for ``source_dl_framework``.

        Args:
            model (str or Path): Path of the ONNX file; its input and output
                names are read from it.
            model_params (ModelParams): Passed as ``network_parameters``.
            input_tfms (MultiStageTransformation): Passed through unchanged.
            source_dl_framework (DeepLearningFramework): Key selecting the
                learner class in ``ONNX_INFERENCE_LEARNERS``.
            quantization_type (QuantizationType): Passed through unchanged.
            kwargs: Accepted but not used by this builder.
        """
        onnx_file = str(model)
        model_inputs = get_input_names(onnx_file)
        model_outputs = get_output_names(onnx_file)

        learner_cls = ONNX_INFERENCE_LEARNERS[source_dl_framework]
        self.inference_learner = learner_cls(
            onnx_path=model,
            network_parameters=model_params,
            input_names=model_inputs,
            output_names=model_outputs,
            input_tfms=input_tfms,
            device=self.device,
            quantization_type=quantization_type,
        )
class PyTorchApacheTVMBuildInferenceLearner(BuildInferenceLearner):
    """Build operation producing a ``PytorchApacheTVMInferenceLearner``."""

    def execute(
        self,
        model: ExecutorFactoryModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Create the TVM graph executor and wrap it in a learner.

        Args:
            model (ExecutorFactoryModule): Compiled TVM library; its
                ``"default"`` entry is called with the TVM device.
            model_params (ModelParams): Passed as ``network_parameters``; one
                ``input_<i>`` name is generated per entry of ``input_infos``.
            input_tfms (MultiStageTransformation): Passed through unchanged.
            kwargs: Accepted but not used by this builder.
        """
        # Target string: CUDA target when running on GPU, "llvm" otherwise.
        if self.device.type is DeviceType.GPU:
            target_device = str(tvm.target.cuda())
        else:
            target_device = "llvm"
        tvm_device = tvm.device(target_device, 0)

        generated_names = [
            f"input_{idx}" for idx, _ in enumerate(model_params.input_infos)
        ]
        executor = tvm.contrib.graph_executor.GraphModule(
            model["default"](tvm_device)
        )

        self.inference_learner = PytorchApacheTVMInferenceLearner(
            input_tfms=input_tfms,
            network_parameters=model_params,
            graph_executor_module=executor,
            input_names=generated_names,
            lib=model,
            target=target_device,
            device=self.device,
        )
class DeepSparseInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized on CPU using DeepSparse.

    DeepSparse is an engine accelerating sparse computations on CPUs.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    name = "DeepSparse"

    def __init__(
        self,
        onnx_path: Union[str, Path],
        input_names: List[str],
        output_names: List[str],
        device: Device,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.onnx_path = self._store_file(onnx_path)

        # The engine is compiled with the core count reported as first item
        # of ``cpu.cpu_details()`` and the batch size of the model params.
        num_cores, _, _ = cpu.cpu_details()
        batch_size = kwargs["network_parameters"].batch_size
        self.engine = compile_model(onnx_path, batch_size, num_cores)

        self.input_names = input_names
        self.output_names = output_names
        self.device = device

    def get_size(self):
        """Return the size in bytes of the stored ONNX file."""
        return os.path.getsize(self.onnx_path)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be
                saved in the model metadata file.
        """
        LearnerMetadata.from_model(
            self,
            input_names=self.input_names,
            output_names=self.output_names,
            **kwargs,
        ).save(path)
        destination = Path(path) / ONNX_FILENAMES["model_name"]
        shutil.copy(self.onnx_path, destination)

    def free_gpu_memory(self):
        """Always raise: this learner has no GPU support."""
        raise NotImplementedError("DeepSparse does not support GPU inference.")

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for consistency
                with other Learners.

        Returns:
            DeepSparseInferenceLearner: The optimized model.
        """
        if kwargs:
            logger.warning(
                f"No extra keywords expected for the load method. Got {kwargs}."
            )

        onnx_path = os.path.join(str(path), ONNX_FILENAMES["model_name"])
        metadata = LearnerMetadata.read(path)

        # Transformations are stored as a dict; rebuild them when present.
        input_tfms = metadata.input_tfms
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )
        device = Device.from_str(metadata.device)

        return cls(
            input_tfms=input_tfms,
            network_parameters=ModelParams(**metadata.network_parameters),
            onnx_path=onnx_path,
            input_names=metadata["input_names"],
            output_names=metadata["output_names"],
            device=device,
        )

    def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):
        """Run the compiled engine on the arrays yielded by the generator."""
        return self.engine(list(input_arrays))
""" input_arrays = ( input_tensor.cpu().detach().numpy() for input_tensor in input_tensors ) outputs = self._predict_arrays(input_arrays) return tuple(torch.from_numpy(output) for output in outputs) DEEPSPARSE_INFERENCE_LEARNERS: Dict[ DeepLearningFramework, Type[DeepSparseInferenceLearner] ] = {DeepLearningFramework.PYTORCH: PytorchDeepSparseInferenceLearner} ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/faster_transformer.py ================================================ from nebullvm.operations.inference_learners.torchscript import ( TorchScriptInferenceLearner, ) class FasterTransformerInferenceLearner(TorchScriptInferenceLearner): MODEL_NAME = "faster_transformer_model_scripted.pt" name = "FasterTransformer" ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/huggingface.py ================================================ from abc import ABC from collections import OrderedDict from pathlib import Path from typing import List, Any, Dict, Union from nebullvm.operations.inference_learners.base import ( InferenceLearnerWrapper, PytorchBaseInferenceLearner, LearnerMetadata, BaseInferenceLearner, ) from nebullvm.optional_modules.diffusers import StableDiffusionPipeline from nebullvm.optional_modules.torch import torch from nebullvm.tools.diffusers import postprocess_diffusers from nebullvm.tools.huggingface import restructure_output from nebullvm.tools.pytorch import get_torch_model_size class HuggingFaceInferenceLearner(InferenceLearnerWrapper): """Class wrapping an InferenceLearner model and giving to it the huggingface interface. The class fuse both the InterfaceLearner and HuggingFace interfaces, giving to the final user a model which can be used whit the prefered API without the need of adapting the previous code. Attributes: network_parameters (ModelParams): Model parameters of the model. 
core_inference_learner (PytorchBaseInferenceLearner): Inference learner built using the Pytorch interface. output_structure (Dict): Original output structure of the HuggingFace model. input_names (List[str]): List of all the input keys used for the original HuggingFace model. output_type (Any, optional): Original output type of the HuggingFace model. """ @property def name(self) -> str: return self.core_inference_learner.name def __init__( self, core_inference_learner: PytorchBaseInferenceLearner, output_structure: OrderedDict, input_names: List[str], output_type: Any = None, ): super().__init__(core_inference_learner) self.output_structure = output_structure self.input_names = input_names self.output_type = output_type def _save_wrapper_extra_info(self): pass def get_size(self): return self.core_inference_learner.get_size() @staticmethod def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict: return builder_inputs def run(self, *args, **kwargs) -> Any: """Run the underlying optimized model for getting a prediction. The method has an hybrid interface. It accepts inputs either as positional or keyword arguments. If only positional arguments are given the method expects the inputs to be in the canonical nebullvm interface. If only keyword arguments are given the method expects them to be in the HuggingFace interface. Mixed representation is not allowed and will result in an error. """ if len(args) > 0 and len(kwargs) > 0: raise RuntimeError( "Not allowed usage of the predict method. " "Either the positional or the keyword arguments must be given." 
) if len(args) > 0: return self.core_inference_learner(*args) inputs = (kwargs.pop(name) for name in self.input_names) outputs = self.core_inference_learner(*inputs) if self.output_type is tuple: return outputs else: return restructure_output( outputs, self.output_structure, self.output_type ) def _get_extra_metadata_kwargs(self) -> Dict: metadata_kwargs = { "output_structure": self.output_structure, "output_structure_keys": list(self.output_structure.keys()), "input_names": self.input_names, } if self.output_type is not None: metadata_kwargs.update( { "output_type": self.output_type.__name__, "output_type_module": self.output_type.__module__, } ) return metadata_kwargs @staticmethod def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict: # we need to guarantee the preservation of the output structure # elements order. output_structure = OrderedDict() for key in metadata["output_structure_keys"]: output_structure[key] = metadata["output_structure"][key] inputs = { "output_structure": output_structure, "input_names": metadata["input_names"], } if metadata["output_type"] is not None: exec( f"from {metadata['output_type_module']} " f"import {metadata['output_type']}" ) inputs["output_type"] = eval(metadata["output_type"]) return inputs class DiffusionInferenceLearner(BaseInferenceLearner, ABC): @property def name(self) -> str: return self.pipeline.unet.model.name def __init__(self, pipeline: StableDiffusionPipeline): self.pipeline = pipeline def __call__(self, *args, **kwargs): return self.pipeline(*args, **kwargs) def run(self, *args, **kwargs) -> Any: self.pipeline(*args, **kwargs) def save(self, path: Union[str, Path], **kwargs): self.pipeline.unet.model.save(path) @classmethod def load( cls, path: Union[Path, str], **kwargs, ): try: pipe = kwargs["pipe"] except KeyError: raise TypeError("Missing required argument 'pipe'") optimized_model = LearnerMetadata.read(path).load_model(path) return postprocess_diffusers( optimized_model, pipe, 
optimized_model.device, ) def get_size(self): ( self.pipeline.unet.model.get_size() + sum( [ get_torch_model_size(v) for (k, v) in self.pipeline.__dict__.items() if isinstance(v, torch.nn.Module) and k != "unet" ] ) / 1e6 ) def free_gpu_memory(self): raise self.pipeline.unet.model.free_gpu_memory() def get_inputs_example(self): raise NotImplementedError() @property def output_format(self): return ".pt" @property def input_format(self): return ".pt" def list2tensor(self, listified_tensor: List) -> Any: raise NotImplementedError() ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/neural_compressor.py ================================================ from abc import ABC from pathlib import Path from typing import Union, Tuple, Dict, Type from loguru import logger from nebullvm.core.models import Device, ModelParams, DeepLearningFramework from nebullvm.operations.inference_learners.base import ( BaseInferenceLearner, LearnerMetadata, PytorchBaseInferenceLearner, ) from nebullvm.optional_modules.neural_compressor import ( cfgs_to_fx_cfgs, cfg_to_qconfig, ) from nebullvm.optional_modules.torch import ( torch, prepare_fx, convert_fx, Module, GraphModule, ) from nebullvm.tools.pytorch import ( save_with_torch_fx, load_with_torch_fx, create_model_inputs_torch, get_torch_model_size, ) from nebullvm.tools.transformations import MultiStageTransformation from nebullvm.tools.utils import check_module_version class NeuralCompressorInferenceLearner(BaseInferenceLearner, ABC): """Model optimized on CPU using IntelNeuralCompressor. Attributes: network_parameters (ModelParams): The model parameters as batch size, input and output sizes. model (torch.fx.GraphModule): Torch fx graph model. 
""" name = "IntelNeuralCompressor" def __init__( self, model: Union[Module, GraphModule], model_quant: GraphModule, device: Device, **kwargs, ): super().__init__(**kwargs) self.model = model self.model_quant = model_quant self.device = device def get_size(self): return get_torch_model_size(self.model_quant) + get_torch_model_size( self.model ) def save(self, path: Union[str, Path], **kwargs): """Save the model. Args: path (Path or str): Path to the directory where the model will be stored. kwargs (Dict): Dictionary of key-value pairs that will be saved in the model metadata file. """ metadata = LearnerMetadata.from_model(self, **kwargs) metadata.save(path) path_orig_model = Path(path) / Path("model_orig") path_quant_model = Path(path) / Path("model_quant") save_with_torch_fx(self.model, path_orig_model) self.model_quant.save(str(path_quant_model)) @classmethod def load(cls, path: Union[Path, str], **kwargs): """Load the model. Args: path (Path or str): Path to the directory where the model is stored. kwargs (Dict): Dictionary of additional arguments for consistency with other Learners. Returns: DeepSparseInferenceLearner: The optimized model. """ if len(kwargs) > 0: logger.warning( f"No extra keywords expected for the load method. " f"Got {kwargs}." 
) metadata = LearnerMetadata.read(path) input_tfms = metadata.input_tfms if input_tfms is not None: input_tfms = MultiStageTransformation.from_dict( metadata.input_tfms ) network_parameters = ModelParams(**metadata.network_parameters) path_orig_model = Path(path) / Path("model_orig") path_quant_model = Path(path) / Path("model_quant") / "best_model.pt" model = load_with_torch_fx( Path(path_orig_model), "state_dict.pt" ).eval() state_dict = torch.load(path_quant_model) tune_cfg = state_dict.pop("best_configure") op_cfgs = cfg_to_qconfig(tune_cfg, tune_cfg["approach"]) fx_op_cfgs = cfgs_to_fx_cfgs(op_cfgs, tune_cfg["approach"]) additional_arguments = {} if check_module_version(torch, min_version="1.13.0"): additional_arguments["example_inputs"] = tuple( create_model_inputs_torch( input_infos=network_parameters.input_infos, ) ) q_model = prepare_fx( model, fx_op_cfgs, **additional_arguments, ) q_model = convert_fx(q_model) q_model.load_state_dict(state_dict) device = Device.from_str(metadata.device) return cls( model=model, model_quant=q_model, device=device, input_tfms=input_tfms, network_parameters=ModelParams(**metadata.network_parameters), ) class PytorchNeuralCompressorInferenceLearner( NeuralCompressorInferenceLearner, PytorchBaseInferenceLearner ): """Model optimized on CPU using IntelNeuralCompressor. Attributes: network_parameters (ModelParams): The model parameters as batch size, input and output sizes. model (torch.fx.GraphModule): Torch fx graph model. """ def free_gpu_memory(self): raise NotImplementedError( "NeuralCompressor does not support GPU inference." ) def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]: """Predict on the input tensors. Note that the input tensors must be on the same batch. If a sequence of tensors is given when the model is expecting a single input tensor (with batch size >= 1) an error is raised. Args: input_tensors (Tuple[Tensor]): Input tensors belonging to the same batch. 
The tensors are expected having dimensions (batch_size, dim1, dim2, ...). Returns: Tuple[Tensor]: Output tensors. Note that the output tensors does not correspond to the prediction on the input tensors with a 1 to 1 mapping. In fact the output tensors are produced as the multiple-output of the model given a (multi-) tensor input. """ inputs = (t.cpu() for t in input_tensors) outputs = self.model_quant(*inputs) if isinstance(outputs, torch.Tensor): outputs = (outputs,) return outputs NEURAL_COMPRESSOR_INFERENCE_LEARNERS: Dict[ DeepLearningFramework, Type[NeuralCompressorInferenceLearner] ] = {DeepLearningFramework.PYTORCH: PytorchNeuralCompressorInferenceLearner} ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/onnx.py ================================================ import multiprocessing import os import shutil from abc import ABC from pathlib import Path from typing import Union, List, Generator, Tuple, Dict, Type import cpuinfo import numpy as np from loguru import logger from nebullvm.config import ( ONNX_FILENAMES, ONNX_PROVIDERS, ) from nebullvm.core.models import ( QuantizationType, Device, DeviceType, ModelParams, DeepLearningFramework, ) from nebullvm.operations.inference_learners.base import ( BaseInferenceLearner, LearnerMetadata, PytorchBaseInferenceLearner, TensorflowBaseInferenceLearner, NumpyBaseInferenceLearner, ) from nebullvm.operations.optimizations.compilers.utils import ( tensorrt_is_available, ) from nebullvm.optional_modules.onnx import onnx from nebullvm.optional_modules.onnxruntime import onnxruntime as ort from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import torch from nebullvm.tools.transformations import MultiStageTransformation def _running_on_intel_cpu(use_gpu): if use_gpu: return False # running on GPU cpu_info = cpuinfo.get_cpu_info()["brand_raw"].lower() if "intel" in cpu_info: return True return False def 
@staticmethod
def _setup_tensorrt(quantization_type: QuantizationType, device: Device):
    """Enable or disable the TensorRT execution provider for onnxruntime.

    The function edits the module-level ``ONNX_PROVIDERS["cuda"]`` list in
    place:

    * when TensorRT is available and ``LD_LIBRARY_PATH`` mentions
      "tensorrt", the first entry is replaced by a configured
      ``TensorrtExecutionProvider`` tuple;
    * otherwise a warning is logged and a bare
      ``"TensorrtExecutionProvider"`` entry, if present, is removed so
      that the remaining providers are used.

    Args:
        quantization_type (QuantizationType): Quantization applied to the
            model. Any non-None value enables fp16 in TensorRT;
            ``QuantizationType.STATIC`` additionally enables int8.
        device (Device): Target device; its ``idx`` and free memory are
            used to configure the provider.
    """
    trt_available = tensorrt_is_available()
    library_path = os.environ.get("LD_LIBRARY_PATH", "")

    if trt_available and "tensorrt" in library_path:
        ONNX_PROVIDERS["cuda"][0] = (
            "TensorrtExecutionProvider",
            {
                "device_id": device.idx,
                "trt_max_workspace_size": device.get_free_memory(),
                "trt_fp16_enable": quantization_type is not None,
                "trt_int8_enable": (
                    quantization_type is QuantizationType.STATIC
                ),
            },
        )
        return

    if trt_available:
        logger.warning(
            "TensorrtExecutionProvider for onnx is not "
            "available. If you want to use it, please "
            "add the path to tensorrt to the "
            "LD_LIBRARY_PATH environment variable. "
            "CUDA provider will be used instead. "
        )
    else:
        logger.warning(
            "TensorRT is not available. "
            "If you want to use it, please install it and "
            "add the path to the LD_LIBRARY_PATH "
            "environment variable. "
            "CUDA provider will be used instead. "
        )

    # Drop the placeholder entry so onnxruntime does not try to load a
    # provider that cannot work on this machine.
    if "TensorrtExecutionProvider" in ONNX_PROVIDERS["cuda"]:
        ONNX_PROVIDERS["cuda"].remove("TensorrtExecutionProvider")
""" metadata = LearnerMetadata.from_model( self, input_names=self.input_names, output_names=self.output_names, **kwargs, ) path = Path(path) path.mkdir(exist_ok=True) metadata.save(path) shutil.copy( self.onnx_path, os.path.join(str(path), ONNX_FILENAMES["model_name"]), ) try: # Tries to load the model onnx.load(os.path.join(str(path), ONNX_FILENAMES["model_name"])) except FileNotFoundError: # If missing files, it means it's saved in onnx external_data # format src_dir = str(Path(self.onnx_path).parent) files = os.listdir(src_dir) for fname in files: if ".onnx" not in fname: shutil.copy2( os.path.join(src_dir, fname), os.path.join(path, fname) ) @classmethod def load(cls, path: Union[Path, str], **kwargs): """Load the model. Args: path (Path or str): Path to the directory where the model is stored. kwargs (Dict): Dictionary of additional arguments for consistency with other Learners. Returns: ONNXInferenceLearner: The optimized model. """ if len(kwargs) > 0: logger.warning( f"No extra keywords expected for the load method. " f"Got {kwargs}." 
def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):
    """Feed the arrays to the onnxruntime session and return its outputs.

    Args:
        input_arrays (Generator[np.ndarray]): Arrays paired positionally
            with ``self.input_names``; pairing stops at the shorter of
            the two sequences.

    Returns:
        Whatever ``self._session.run`` returns for ``self.output_names``.
    """
    feeds = dict(zip(self.input_names, input_arrays))
    return self._session.run(self.output_names, feeds)
class TensorflowONNXInferenceLearner(
    ONNXInferenceLearner, TensorflowBaseInferenceLearner
):
    """ONNX model executed by onnxruntime behind a tensorflow interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model was
            produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Compute the model outputs for one batch of inputs.

        All the inputs must belong to the same batch and are expected to
        have dimensions (batch_size, dim1, dim2, ...). Numpy arrays are
        accepted as well and are forwarded untouched.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the
                same batch.

        Returns:
            Tuple[Tensor]: Output tensors. They are the (multiple) outputs
                of the model for the given (multi-)tensor input, not a
                1 to 1 mapping of the inputs.
        """
        session_missing = (
            self.device.type is DeviceType.GPU and not self._is_gpu_ready
        )
        if session_missing:
            # The session was released: build it again before predicting.
            self.set_model_on_gpu()

        def _as_array(tensor):
            if isinstance(tensor, np.ndarray):
                return tensor
            return tensor.numpy()

        predictions = self._predict_arrays(map(_as_array, input_tensors))
        # noinspection PyTypeChecker
        return tuple(
            tf.convert_to_tensor(prediction) for prediction in predictions
        )
""" if self.device.type is DeviceType.GPU and not self._is_gpu_ready: self.set_model_on_gpu() input_arrays = (input_tensor for input_tensor in input_tensors) outputs = self._predict_arrays(input_arrays) return tuple(outputs) ONNX_INFERENCE_LEARNERS: Dict[ DeepLearningFramework, Type[ONNXInferenceLearner] ] = { DeepLearningFramework.PYTORCH: PytorchONNXInferenceLearner, DeepLearningFramework.TENSORFLOW: TensorflowONNXInferenceLearner, DeepLearningFramework.NUMPY: NumpyONNXInferenceLearner, } ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/openvino.py ================================================ import json import shutil from abc import ABC from pathlib import Path from typing import Dict, Union, Type, Generator, Tuple, List, Optional import numpy as np from loguru import logger from nebullvm.config import OPENVINO_FILENAMES from nebullvm.core.models import Device, ModelParams, DeepLearningFramework from nebullvm.operations.inference_learners.base import ( BaseInferenceLearner, LearnerMetadata, PytorchBaseInferenceLearner, TensorflowBaseInferenceLearner, NumpyBaseInferenceLearner, ) from nebullvm.optional_modules.openvino import ( Core, Model, CompiledModel, InferRequest, ) from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import torch from nebullvm.tools.data import DataManager from nebullvm.tools.transformations import MultiStageTransformation class OpenVinoInferenceLearner(BaseInferenceLearner, ABC): """Model optimized using OpenVINO. The class cannot be directly instantiated, but implements all the core methods needed for using OpenVINO at inference time. Attributes: network_parameters (ModelParams): The model parameters as batch size, input and output sizes. exec_network (any): The graph executor. This is the central component in the OpenVino optimized model execution. input_keys (List): Keys associated to the inputs. 
@classmethod
def load(cls, path: Union[Path, str], **kwargs):
    """Rebuild a learner from a directory written by ``save``.

    The metadata json is read, overridden with ``kwargs``, and its
    ``network_parameters``, ``input_tfms`` and ``device`` entries are
    converted back to their object form before being forwarded to
    ``from_model_name``.

    Args:
        path (Path or str): Path to the directory where the model is
            stored.
        kwargs (Dict): Dictionary of additional arguments for the
            `from_model_name` class method. They take precedence over
            the values stored in the metadata file.

    Returns:
        OpenVinoInferenceLearner: The optimized model.
    """
    model_dir = Path(path)
    metadata_file = model_dir / OPENVINO_FILENAMES["metadata"]
    metadata = json.loads(metadata_file.read_text())
    metadata.update(kwargs)

    metadata["network_parameters"] = ModelParams(
        **metadata["network_parameters"]
    )
    stored_tfms = metadata.get("input_tfms")
    if stored_tfms is not None:
        metadata["input_tfms"] = MultiStageTransformation.from_dict(
            stored_tfms
        )

    description_file = str(
        model_dir / OPENVINO_FILENAMES["description_file"]
    )
    weights_file = str(model_dir / OPENVINO_FILENAMES["weights"])
    metadata["device"] = Device.from_str(metadata["device"])

    return cls.from_model_name(
        model_name=description_file,
        model_weights=weights_file,
        **metadata,
    )
""" if len(kwargs) > 0: logger.warning(f"Found extra parameters: {kwargs}") core = Core() model = core.read_model(model=model_name, weights=model_weights) dynamic_shape = cls._get_dynamic_shape(model, network_parameters) if dynamic_shape is not None: model.reshape(dynamic_shape) compiled_model = core.compile_model(model=model, device_name="CPU") infer_request = compiled_model.create_infer_request() input_keys = list( map(lambda obj: obj.get_any_name(), compiled_model.inputs) ) output_keys = list( map(lambda obj: obj.get_any_name(), compiled_model.outputs) ) return cls( compiled_model, infer_request, input_keys, output_keys, input_tfms=input_tfms, network_parameters=network_parameters, description_file=model_name, weights_file=model_weights, input_data=input_data, device=device, ) @staticmethod def _get_dynamic_shape( model: Model, network_parameters: ModelParams ) -> Optional[Dict[str, Tuple[int]]]: if network_parameters.dynamic_info is None: return None input_names = [ list(model_input.names)[0] for model_input in model.inputs ] input_shapes = [ input_info.size for input_info in network_parameters.input_infos ] dynamic_shapes = [] assert len(input_shapes) == len( network_parameters.dynamic_info.inputs ), ( f"Number of inputs defined in dynamic info " f"({len(input_shapes)}) is different from the one " f"expected from the model " f"({len(network_parameters.dynamic_info.inputs)})." 
def _predict_array(
    self,
    input_arrays: Generator[np.ndarray, None, None],
) -> Generator[np.ndarray, None, None]:
    """Run the infer request and yield the outputs in a fixed order.

    Args:
        input_arrays (Generator[np.ndarray]): Arrays paired positionally
            with ``self.input_keys``.

    Returns:
        Generator[np.ndarray]: The output arrays, ordered as
            ``self.output_keys``.
    """
    feeds = dict(zip(self.input_keys, input_arrays))
    raw_results = self.infer_request.infer(inputs=feeds)

    # The results are keyed by output objects: index them by name so
    # they can be looked up through ``self.output_keys``.
    by_name = {}
    for port, array in raw_results.items():
        by_name[port.get_any_name()] = array

    return (by_name[name] for name in self.output_keys)
class TensorflowOpenVinoInferenceLearner(
    OpenVinoInferenceLearner, TensorflowBaseInferenceLearner
):
    """Model optimized using OpenVINO with a tensorflow interface.

    This class can be used exactly in the same way as a tf.Module or
    keras.Model object.
    At prediction time it takes as input tensorflow tensors given as
    positional arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        compiled_model (CompiledModel): The compiled OpenVINO model.
        infer_request (InferRequest): The request object used to run the
            inference.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the
            optimized model.
        weights_file (str): File containing the model weights.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a
        sequence of tensors is given when the model is expecting a single
        input tensor (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the
                same batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...). Numpy arrays are accepted
                too and are passed through without conversion.

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors
                do not correspond to the prediction on the input tensors
                with a 1 to 1 mapping. In fact the output tensors are
                produced as the multiple-output of the model given a
                (multi-) tensor input.
        """
        # Numpy arrays have no ``numpy()`` method: forward them as they
        # are, consistently with the onnxruntime tensorflow learner.
        input_arrays = (
            input_tensor
            if isinstance(input_tensor, np.ndarray)
            else input_tensor.numpy()
            for input_tensor in input_tensors
        )
        output_arrays = self._predict_array(input_arrays)
        # noinspection PyTypeChecker
        return tuple(
            tf.convert_to_tensor(output_array)
            for output_array in output_arrays
        )
If a sequence of tensors is given when the model is expecting a single input tensor (with batch size >= 1) an error is raised. Args: input_tensors (Tuple[np.ndarray]): Input tensors belonging to the same batch. The tensors are expected having dimensions (batch_size, dim1, dim2, ...). Returns: Tuple[np.ndarray]: Output tensors. Note that the output tensors does not correspond to the prediction on the input tensors with a 1 to 1 mapping. In fact the output tensors are produced as the multiple-output of the model given a (multi-) tensor input. """ input_arrays = (input_tensor for input_tensor in input_tensors) output_arrays = self._predict_array(input_arrays) return tuple(output_arrays) OPENVINO_INFERENCE_LEARNERS: Dict[ DeepLearningFramework, Type[OpenVinoInferenceLearner] ] = { DeepLearningFramework.PYTORCH: PytorchOpenVinoInferenceLearner, DeepLearningFramework.TENSORFLOW: TensorflowOpenVinoInferenceLearner, DeepLearningFramework.NUMPY: NumpyOpenVinoInferenceLearner, } ================================================ FILE: optimization/nebullvm/nebullvm/operations/inference_learners/tensor_rt.py ================================================ import json import os from abc import ABC from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Union, Dict, Type, List, Tuple, Generator, Optional import numpy as np from loguru import logger from nebullvm.config import NVIDIA_FILENAMES from nebullvm.core.models import ( Device, DeviceType, ModelParams, DeepLearningFramework, ) from nebullvm.operations.inference_learners.base import ( BaseInferenceLearner, LearnerMetadata, PytorchBaseInferenceLearner, TensorflowBaseInferenceLearner, NumpyBaseInferenceLearner, ) from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.tensor_rt import tensorrt as trt, polygraphy from nebullvm.optional_modules.torch import torch, ScriptModule from nebullvm.tools.data import DataManager from 
@staticmethod
def check_env(use_gpu):
    """Ensure the learner is being used with a GPU.

    Args:
        use_gpu (bool): Whether a GPU is going to be used.

    Raises:
        SystemError: If ``use_gpu`` is falsy.
    """
    if use_gpu:
        return
    raise SystemError(
        "You are trying to run an optimizer developed for NVidia gpus "
        "on a machine not connected to any GPU supporting CUDA."
    )
""" if kwargs: logger.warning( f"Debug: Got extra keywords in " f"NvidiaInferenceLearner::from_engine_path: {kwargs}" ) if nvidia_logger is None: nvidia_logger = trt.Logger(trt.Logger.WARNING) if input_tfms is None: input_tfms = MultiStageTransformation([]) input_tfms.append(VerifyContiguity()) runtime = trt.Runtime(nvidia_logger) with open(engine_path, "rb") as f: serialized_engine = f.read() engine = runtime.deserialize_cuda_engine(serialized_engine) return cls( input_tfms=input_tfms, network_parameters=network_parameters, engine=engine, input_names=input_names, output_names=output_names, nvidia_logger=nvidia_logger, cuda_stream=cuda_stream, input_data=input_data, device=device, ) def _predict_tensors( self, input_ptrs: Generator[Any, None, None], output_ptrs: Generator[Any, None, None], input_shapes: Generator[Any, None, None] = None, ): buffers = [None] * (len(self.input_names) + len(self.output_names)) input_idxs = ( self.engine[input_name] for input_name in self.input_names ) output_idxs = ( self.engine[output_name] for output_name in self.output_names ) input_shapes = input_shapes or [None] * len(self.input_names) for input_idx, input_ptr, input_shape in zip( input_idxs, input_ptrs, input_shapes ): buffers[input_idx] = input_ptr if input_shape is not None: # If the input shape is empty, we set it to (1,) because # TensorRT doesn't accept empty shapes. if input_shape == torch.Size([]): input_shape = torch.Size((1,)) self.context.set_binding_shape(input_idx, input_shape) for output_idx, output_ptr in zip(output_idxs, output_ptrs): buffers[output_idx] = output_ptr self.context.execute_async_v2(buffers, self.stream_ptr) self._synchronize_stream() def get_size(self): return self.engine.serialize().nbytes def free_gpu_memory(self): # ONNXtensorrt doesn't need to release gpu memory pass def save(self, path: Union[str, Path], **kwargs): """Save the model. Args: path (Path or str): Path to the directory where the model will be stored. 
class PytorchTensorRTInferenceLearner(PytorchBaseInferenceLearner):
    """Learner wrapping a TorchScript module and exposing a torch interface.

    Attributes:
        model (ScriptModule): The wrapped module, set in eval mode.
        device (Device): Device used to run the model.
        use_gpu (bool): Whether ``device`` is a GPU.
    """

    MODEL_NAME = "model_optimized.pt"
    name = "TensorRT"

    def __init__(
        self,
        torch_model: ScriptModule,
        device: Device,
        **kwargs,
    ):
        """Store the module and move it to the device when it is a GPU.

        Args:
            torch_model (ScriptModule): Module to be wrapped.
            device (Device): Device used to run the model.
            kwargs (Dict): Arguments forwarded to the parent class.
        """
        super().__init__(**kwargs)
        self.model = torch_model.eval()

        on_gpu = device.type is DeviceType.GPU
        if on_gpu:
            self.model.to(device.to_torch_format())
        self.use_gpu = on_gpu
        self.device = device
        self._is_gpu_ready = on_gpu

    def get_size(self):
        """Return the size in bytes of the files written by ``save``.

        The model is saved in a temporary directory and the sizes of the
        regular files found at its top level are summed.
        """
        with TemporaryDirectory() as tmp_dir:
            self.save(tmp_dir)
            folder = Path(tmp_dir)
            return sum(
                entry.stat().st_size
                for entry in folder.iterdir()
                if entry.is_file()
            )

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors. They are moved
                to the learner device; int64 tensors are first cast to
                int32.

        Returns:
            Tuple[Tensor]: Output tensors located on the learner device.
                A non-tuple model output is wrapped in a 1-tuple.
        """
        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:
            self.set_model_on_gpu()

        def _on_device(tensor):
            return tensor.to(self.device.to_torch_format())

        # PyTorch-TensorRT does not support int64
        prepared = (
            _on_device(tensor.to(torch.int32))
            if tensor.dtype == torch.int64
            else _on_device(tensor)
            for tensor in input_tensors
        )

        with torch.no_grad():
            res = self.model(*prepared)
            if isinstance(res, tuple):
                return tuple(_on_device(out) for out in res)
            return (_on_device(res),)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path or str): Path to the directory where the metadata
                and the TorchScript module will be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be
                saved in the model metadata file.
        """
        target_dir = Path(path)
        target_dir.mkdir(exist_ok=True)
        LearnerMetadata.from_model(self, **kwargs).save(target_dir)
        torch.jit.save(self.model, target_dir / self.MODEL_NAME)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Accepted for consistency with the other
                learners; not used here.

        Returns:
            PytorchTensorRTInferenceLearner: The loaded model, always
                bound to a GPU device.
        """
        source_dir = Path(path)
        model = torch.jit.load(source_dir / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(source_dir)
        device = Device(DeviceType.GPU)

        network_parameters = ModelParams(**metadata.network_parameters)
        if metadata.input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )
        else:
            input_tfms = None

        return cls(
            torch_model=model,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            device=device,
        )
""" def _synchronize_stream(self): self.cuda_stream.synchronize() @staticmethod def _get_default_cuda_stream() -> Any: return torch.cuda.default_stream() @property def stream_ptr(self): return self.cuda_stream.cuda_stream def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]: """Predict on the input tensors. Note that the input tensors must be on the same batch. If a sequence of tensors is given when the model is expecting a single input tensor (with batch size >= 1) an error is raised. Args: input_tensors (Tuple[Tensor]): Input tensors belonging to the same batch. The tensors are expected having dimensions (batch_size, dim1, dim2, ...). Returns: Tuple[Tensor]: Output tensors. Note that the output tensors does not correspond to the prediction on the input tensors with a 1 to 1 mapping. In fact the output tensors are produced as the multiple-output of the model given a (multi-) tensor input. """ input_tensors = [ input_tensor.to(self.device.to_torch_format()) for input_tensor in input_tensors ] if self.network_parameters.dynamic_info is None: if self.output_tensors is None: self.output_tensors = [ torch.Tensor(*output_size) .to(self.device.to_torch_format()) .to(output_type.to_torch_format()) for output_size, output_type in zip( self.network_parameters.output_sizes, self.network_parameters.output_types, ) ] input_sizes = None else: dynamic_info = self.network_parameters.dynamic_info input_sizes = [ input_tensor.size() for input_tensor in input_tensors ] self.output_tensors = [ torch.Tensor( *( x if i not in dynamic_axis.keys() else dynamic_info.retrieve_output_dim( input_sizes, j, i, x ) for i, x in enumerate(output_size) ), ) .to(self.device.to_torch_format()) .to(output_type.to_torch_format()) for j, (output_size, output_type, dynamic_axis) in enumerate( zip( self.network_parameters.output_sizes, self.network_parameters.output_types, dynamic_info.outputs, ) ) ] input_ptrs = ( input_tensor.data_ptr() for input_tensor in input_tensors ) output_ptrs 
class BaseArrayONNXTensorRTInferenceLearner(ONNXTensorRTInferenceLearner, ABC):
    """Base model for all array-based NvidiaInferenceLearners.

    Inputs and outputs are exchanged with the engine through
    ``polygraphy.cuda.DeviceArray`` buffers. Subclasses copy their inputs
    into device arrays and call :meth:`_predict_array`, which owns the
    release of every device buffer it is given or allocates.
    """

    def _synchronize_stream(self):
        """Wait for the work queued on ``self.cuda_stream`` to finish."""
        self.cuda_stream.synchronize()

    @staticmethod
    def _get_default_cuda_stream() -> Any:
        """Return a new polygraphy CUDA stream."""
        return polygraphy.cuda.Stream()

    @property
    def stream_ptr(self):
        """Raw pointer of the CUDA stream (``cuda_stream.ptr``)."""
        return self.cuda_stream.ptr

    @staticmethod
    def _convert_to_array_and_free_memory(cuda_array) -> np.ndarray:
        """Copy a device array to a NumPy array and release the device buffer.

        The buffer is released even when the host copy fails, so a failing
        conversion does not leak device memory.
        """
        try:
            return cuda_array.numpy()
        finally:
            cuda_array.free()

    def _predict_array(
        self,
        cuda_input_arrays: List,
        input_shapes: Optional[List[Tuple[int, ...]]],
    ) -> Generator[np.ndarray, None, None]:
        """Run the engine on device arrays and yield the outputs as NumPy.

        Args:
            cuda_input_arrays (List): Device arrays holding the inputs. They
                are always freed by this method, also when inference fails.
            input_shapes (List[Tuple[int, ...]], optional): Shapes of the
                inputs. Used to resolve the dynamic output dimensions when
                ``network_parameters.dynamic_info`` is set.

        Returns:
            Generator[np.ndarray]: Lazily converted outputs. Each output
            device buffer is freed when its element is produced, so the
            generator must be fully consumed to release all of them.
        """
        dynamic_info = self.network_parameters.dynamic_info
        output_specs = zip(
            self.network_parameters.output_sizes,
            self.network_parameters.output_types,
        )
        cuda_output_arrays = []
        try:
            if dynamic_info is None:
                for output_size, output_type in output_specs:
                    cuda_output_arrays.append(
                        polygraphy.cuda.DeviceArray(
                            shape=output_size,
                            dtype=output_type.to_numpy_format(),
                        )
                    )
            else:
                for j, ((output_size, output_type), dyn_out_axis) in enumerate(
                    zip(output_specs, dynamic_info.outputs)
                ):
                    # Static axes keep their declared size; dynamic ones are
                    # resolved from the shapes of the current inputs.
                    shape = tuple(
                        x
                        if i not in dyn_out_axis.keys()
                        else dynamic_info.retrieve_output_dim(
                            input_shapes, j, i, x
                        )
                        for i, x in enumerate(output_size)
                    )
                    cuda_output_arrays.append(
                        polygraphy.cuda.DeviceArray(
                            shape=shape,
                            dtype=output_type.to_numpy_format(),
                        )
                    )
            input_ptrs = (cuda_array.ptr for cuda_array in cuda_input_arrays)
            output_ptrs = (
                cuda_array.ptr for cuda_array in cuda_output_arrays
            )
            self._predict_tensors(input_ptrs, output_ptrs, input_shapes)
        except Exception:
            # Nothing will consume the outputs after a failure: release the
            # buffers allocated so far instead of leaking them.
            for cuda_output_array in cuda_output_arrays:
                cuda_output_array.free()
            raise
        finally:
            for cuda_input_array in cuda_input_arrays:
                cuda_input_array.free()
        return (
            self._convert_to_array_and_free_memory(array)
            for array in cuda_output_arrays
        )
def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
    """Predict on the input tensors.

    Note that the input tensors must be on the same batch. If a sequence
    of tensors is given when the model is expecting a single input tensor
    (with batch size >= 1) an error is raised.

    Args:
        input_tensors (Tuple[Tensor]): Input tensors belonging to the same
            batch. The tensors are expected having dimensions
            (batch_size, dim1, dim2, ...).

    Returns:
        Tuple[Tensor]: Output tensors. Note that the output tensors do not
            correspond to the prediction on the input tensors with a 1 to 1
            mapping. In fact the output tensors are produced as the
            multiple-output of the model given a (multi-) tensor input.
    """
    # Convert every tensor to NumPy exactly once; the result provides both
    # the dtype of the device buffer and the data copied into it.
    host_arrays = [input_tensor.numpy() for input_tensor in input_tensors]
    cuda_input_arrays = [
        polygraphy.cuda.DeviceArray(
            shape=tuple(input_tensor.shape),
            dtype=host_array.dtype,
        ).copy_from(host_array, stream=self.cuda_stream)
        for input_tensor, host_array in zip(input_tensors, host_arrays)
    ]
    # Input shapes are only needed to resolve dynamic output dimensions.
    input_shapes = (
        [tuple(input_tensor.shape) for input_tensor in input_tensors]
        if self.network_parameters.dynamic_info is not None
        else None
    )
    out_arrays = self._predict_array(cuda_input_arrays, input_shapes)
    return tuple(tf.convert_to_tensor(array) for array in out_arrays)
def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
    """Run the wrapped TensorFlow model on the learner's device.

    Args:
        input_tensors (Tuple[Tensor]): Input tensors; they are handed to the
            model together, as a single tuple argument.

    Returns:
        Tuple[Tensor]: The model output. A non-tuple result is wrapped in a
        one-element tuple.
    """
    gpu_setup_pending = (
        self.device.type is DeviceType.GPU and not self._is_gpu_ready
    )
    if gpu_setup_pending:
        self.set_model_on_gpu()

    with tf.device(self.device.to_tf_format()):
        outputs = self.model(input_tensors)

    return outputs if isinstance(outputs, tuple) else (outputs,)
class TFLiteBackendInferenceLearner(TensorflowBaseInferenceLearner):
    """Inference learner running a serialized TFLite model.

    Attributes:
        tflite_file (bytes): Content of the ``.tflite`` flatbuffer.
        interpreter (tf.lite.Interpreter): Interpreter built from
            ``tflite_file``.
        device (Device): Device associated with the learner.
    """

    name = "TFLite"

    def __init__(self, tflite_file: bytes, device: Device, **kwargs):
        super(TFLiteBackendInferenceLearner, self).__init__(**kwargs)
        self.tflite_file = tflite_file
        self.interpreter = tf.lite.Interpreter(model_content=tflite_file)
        self.device = device

    def get_size(self):
        """Return the size in bytes of the serialized TFLite model."""
        return len(self.tflite_file)

    def free_gpu_memory(self):
        raise NotImplementedError(
            "TFLite does not support GPU inference on Nvidia devices"
        )

    def run(self, *input_tensors: tf.Tensor):
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors, in the same order
                as the interpreter's input details.

        Returns:
            Tuple[Tensor]: One tensor per interpreter output.
        """
        input_details = self.interpreter.get_input_details()
        output_details = self.interpreter.get_output_details()
        if self.network_parameters.dynamic_info:
            for input_tensor, detail in zip(input_tensors, input_details):
                if input_tensor.shape != tuple(detail["shape"]):
                    # The interpreter addresses tensors by their own index,
                    # which need not match the position in the input list.
                    self.interpreter.resize_tensor_input(
                        detail["index"], input_tensor.shape
                    )
        # NOTE(review): the collapsed source does not show whether this call
        # was nested under the dynamic-shape branch. It is kept at function
        # level because nothing else allocates the tensors before set_tensor.
        self.interpreter.allocate_tensors()
        for i, input_tensor in enumerate(input_tensors):
            self.interpreter.set_tensor(
                input_details[i]["index"], input_tensor
            )
        self.interpreter.invoke()
        return tuple(
            tf.convert_to_tensor(
                self.interpreter.get_tensor(output_detail["index"])
            )
            for output_detail in output_details
        )

    def save(self, path: Union[str, Path], **kwargs):
        """Save the learner metadata and the TFLite model in ``path``.

        Args:
            path (Path or str): Directory where the model will be stored. It
                is created when missing, as the other learners do.
            kwargs (Dict): Extra key-value pairs stored in the metadata.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        with open(
            path / TENSORFLOW_BACKEND_FILENAMES["tflite_model"], "wb"
        ) as f:
            f.write(self.tflite_file)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load a learner previously stored with :meth:`save`.

        Args:
            path (Path or str): Directory containing the saved model.
            kwargs (Dict): Accepted for interface compatibility; not used.

        Returns:
            TFLiteBackendInferenceLearner: The restored learner.
        """
        path = Path(path)
        tflite_file_path = str(
            path / TENSORFLOW_BACKEND_FILENAMES["tflite_model"]
        )
        with open(tflite_file_path, "rb") as f:
            tflite_file = f.read()
        metadata = LearnerMetadata.read(path)
        network_parameters = ModelParams(**metadata.network_parameters)
        input_tfms = metadata.input_tfms
        device = Device.from_str(metadata.device)
        return cls(
            tflite_file=tflite_file,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            device=device,
        )
class TorchXLAInferenceLearner(PytorchBaseInferenceLearner):
    """Inference learner wrapping a PyTorch module for TPU devices.

    Attributes:
        model (torch.nn.Module): The wrapped model, set to eval mode.
        device (Device): Device where the model is executed.
    """

    MODEL_NAME = "model_scripted.pt"
    name = "TorchXLA"

    def __init__(self, torch_model: torch.nn.Module, device: Device, **kwargs):
        super().__init__(**kwargs)
        self.model = torch_model.eval()
        if device.type is DeviceType.TPU:
            self.model.to(device.to_torch_format())
        self.device = device
        self._is_gpu_ready = self.device.type is DeviceType.TPU

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors. On a TPU device
                they are moved to the learner's device first.

        Returns:
            Tuple[Tensor]: The model outputs. A non-tuple result is wrapped
            in a one-element tuple.
        """
        if self.device.type is DeviceType.TPU and not self._is_gpu_ready:
            self.set_model_on_gpu()
        if self.device.type is DeviceType.TPU:
            input_tensors = (
                t.to(self.device.to_torch_format()) for t in input_tensors
            )
        with torch.no_grad():
            res = self.model(*input_tensors)
            if not isinstance(res, tuple):
                return (res,)
            return tuple(out for out in res)

    def get_size(self):
        """Return the model size in bytes.

        The pickled size is used when the model can be pickled; on a
        ``RuntimeError`` the learner is saved to a temporary directory and
        the sizes of the written files are summed instead.
        """
        try:
            if hasattr(self.model, "core_model"):
                return len(pickle.dumps(self.model.core_model, -1))
            else:
                # Normal torch model
                return len(pickle.dumps(self.model, -1))
        except RuntimeError:
            with TemporaryDirectory() as tmp_dir:
                self.save(tmp_dir)
                return sum(
                    os.path.getsize(Path(tmp_dir) / f)
                    for f in os.listdir(Path(tmp_dir))
                    if os.path.isfile(Path(tmp_dir) / f)
                )

    def save(self, path: Union[str, Path], **kwargs):
        """Save the learner metadata and the model in ``path``.

        The model is serialized from the CPU and then moved back to the
        learner's device, so the learner stays usable after saving.

        Args:
            path (Path or str): Directory where the model will be stored.
            kwargs (Dict): Extra key-value pairs stored in the metadata.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        self.model.cpu()
        try:
            torch.save(self.model, path / self.MODEL_NAME)
        finally:
            # ``Module.cpu()`` moves the parameters in place. Without moving
            # them back, the next ``run`` would pair a CPU model with inputs
            # sent to the TPU (``_is_gpu_ready`` is still True).
            if self.device.type is DeviceType.TPU:
                self.model.to(self.device.to_torch_format())

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load a learner previously stored with :meth:`save`.

        Args:
            path (Path or str): Directory containing the saved model.
            kwargs (Dict): Accepted for interface compatibility; not used.

        Returns:
            TorchXLAInferenceLearner: The restored learner.
        """
        path = Path(path)
        # NOTE: torch.load unpickles the file; only load trusted models.
        model = torch.load(path / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(path)
        device = Device.from_str(metadata.device)
        model.to(device.to_torch_format())
        return cls(
            torch_model=model,
            network_parameters=ModelParams(**metadata.network_parameters),
            input_tfms=MultiStageTransformation.from_dict(metadata.input_tfms)
            if metadata.input_tfms is not None
            else None,
            device=device,
        )
@classmethod
def from_torch_model(
    cls,
    model: Union[Module, GraphModule],
    network_parameters: ModelParams,
    device: Device,
    input_tfms: Optional[MultiStageTransformation] = None,
    input_data: Optional[List[torch.Tensor]] = None,
):
    """Build the learner by converting a PyTorch model to TorchScript.

    A ``torch.fx.GraphModule`` is scripted directly. Any other model is set
    to eval mode and converted with the first strategy that succeeds:
    symbolic trace followed by scripting, plain scripting, and finally
    tracing with ``input_data``.

    Args:
        model (Module or GraphModule): Model to convert.
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        device (Device): Device where the model will be executed.
        input_tfms (MultiStageTransformation, optional): Transformations
            to be applied to the model inputs.
        input_data (List[Tensor], optional): Example inputs. Only required
            when the model has to be traced.

    Returns:
        TorchScriptInferenceLearner: Learner wrapping the scripted model.

    Raises:
        TypeError: If tracing is needed but ``input_data`` is None.
    """
    # input_data is optional: only move it to the GPU when it was given.
    if input_data is not None and device.type is DeviceType.GPU:
        input_data = [t.to(device.to_torch_format()) for t in input_data]

    if isinstance(model, torch.fx.GraphModule):
        model_scripted = torch.jit.script(model)
    else:
        model.eval()
        # Deliberate best-effort chain: each failure falls through to the
        # next, less strict, conversion strategy.
        try:
            model_scripted = symbolic_trace(model)
            model_scripted = torch.jit.script(model_scripted)
        except Exception:
            try:
                model_scripted = torch.jit.script(model)
            except Exception as script_error:
                if input_data is None:
                    raise TypeError(
                        "input_data is required to trace a model that "
                        "cannot be scripted"
                    ) from script_error
                model_scripted = torch.jit.trace(model, tuple(input_data))

    return cls(
        torch_model=model_scripted,
        network_parameters=network_parameters,
        input_tfms=input_tfms,
        input_data=input_data,
        device=device,
    )
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework from nebullvm.operations.inference_learners.base import ( BaseInferenceLearner, LearnerMetadata, PytorchBaseInferenceLearner, TensorflowBaseInferenceLearner, NumpyBaseInferenceLearner, ) from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import torch from nebullvm.optional_modules.tvm import ( GraphModule, tvm, ExecutorFactoryModule, ) from nebullvm.tools.data import DataManager from nebullvm.tools.transformations import ( MultiStageTransformation, HalfPrecisionTransformation, ) class ApacheTVMInferenceLearner(BaseInferenceLearner, ABC): """Model optimized using ApacheTVM. The class cannot be directly instantiated, but implements all the core methods needed for using ApacheTVM at inference time. Attributes: network_parameters (ModelParams): The model parameters as batch size, input and output sizes. graph_executor_module (GraphModule): The graph executor. This is the central component in the ApacheTVM optimized model execution. input_names (List[str]): Names associated to the model input tensors. lib (Module): Component needed for loading the ApacheTVM optimized model. target (str): Target device. It can be wither `llvm` for targeting CPUs or "cuda" for targeting GPUs. engine_path (Path, optional): Path to the serialized engine. To be used after loading the model (avoiding double engine serialization). 
""" name = "ApacheTVM" def __init__( self, graph_executor_module: GraphModule, input_names: List[str], lib: ExecutorFactoryModule, target: str, device: Device, engine_path: Path = None, **kwargs ): super().__init__(**kwargs) self.graph_executor_module = graph_executor_module self.input_names = input_names self.lib = lib self.target = target self.engine_path = ( self._store_file(engine_path) if engine_path is not None else engine_path ) self.device = device def get_size(self): with TemporaryDirectory() as tmp_dir: self.save(tmp_dir) return sum( os.path.getsize(Path(tmp_dir) / f) for f in os.listdir(Path(tmp_dir)) if os.path.isfile(Path(tmp_dir) / f) ) def _has_half_precision_transformation(self): for tfm in self.input_tfms.to_list(): if isinstance(tfm, HalfPrecisionTransformation): return True return False def _predict_array( self, input_arrays: Generator[np.ndarray, None, None] ) -> Generator[np.ndarray, None, None]: for name, array in zip(self.input_names, input_arrays): self.graph_executor_module.set_input(name, array) self.graph_executor_module.run() tvm_outputs = ( self.graph_executor_module.get_output( i, tvm.nd.empty( shape=output_size, dtype="float16" if self._has_half_precision_transformation() else "float32", ), ).numpy() for i, output_size in enumerate( self.network_parameters.output_sizes ) ) return tvm_outputs def free_gpu_memory(self): # TODO: check if tvm needs to release GPU pass def save(self, path: Union[str, Path], **kwargs): """Save the model. Args: path (Path or str): Path to the directory where the model will be stored. kwargs (Dict): Dictionary of key-value pairs that will be saved in the model metadata file. 
@classmethod
def load(cls, path: Union[Path, str], **kwargs):
    """Load the model.

    Args:
        path (Path or str): Path to the directory where the model is
            stored.
        kwargs (Dict): Accepted for interface compatibility; currently not
            forwarded to `from_runtime_module`.

    Returns:
        ApacheTVMInferenceLearner: The optimized model.
    """
    path = Path(path)
    engine_file = path / TVM_FILENAMES["engine"]
    metadata = LearnerMetadata.read(path).to_dict()
    network_parameters = ModelParams(**metadata["network_parameters"])
    lib = tvm.runtime.load_module(engine_file)
    target_device = metadata["target"]
    input_names = metadata["input_names"]
    input_tfms = metadata.get("input_tfms")
    if input_tfms is not None:
        input_tfms = MultiStageTransformation.from_dict(input_tfms)
    device = Device.from_str(metadata["device"])
    learner = cls.from_runtime_module(
        network_parameters=network_parameters,
        lib=lib,
        target_device=target_device,
        input_names=input_names,
        device=device,
        # Restore the saved transformations; without this the loaded
        # learner would silently lose them.
        input_tfms=input_tfms,
    )
    # Remember the serialized engine so a later save can copy it.
    learner.engine_path = engine_file
    return learner
class BaseArrayApacheTVMInferenceLearner(ApacheTVMInferenceLearner, ABC):
    """Base Model that can be used for all array-based
    ApacheTVMInferenceLearners.
    """

    def _inner_predict(
        self,
        input_arrays: Generator[np.ndarray, None, None],
        input_shapes: Optional[List[Tuple[int, ...]]],
    ) -> Generator[np.ndarray, None, None]:
        """Run the prediction, handling dynamic shapes by pad and crop.

        When ``network_parameters.dynamic_info`` is set, every input is
        zero-padded at the end of each axis up to the size declared in
        ``network_parameters.input_sizes``, and every output is cropped
        along its dynamic axes to the size computed from the original
        input shapes.

        Args:
            input_arrays (Generator[np.ndarray]): Input arrays.
            input_shapes (List[Tuple[int, ...]], optional): Original shapes
                of the inputs. Must be given when dynamic info is set.

        Returns:
            Generator[np.ndarray]: The output arrays.
        """
        dynamic_info = self.network_parameters.dynamic_info
        if dynamic_info is not None:
            input_arrays = (
                np.pad(
                    input_array,
                    [
                        (0, abs(x - y))
                        for x, y in zip(input_array.shape, input_size)
                    ],
                    mode="constant",
                    constant_values=0,
                )
                for input_array, input_size in zip(
                    input_arrays, self.network_parameters.input_sizes
                )
            )
        output_arrays = self._predict_array(input_arrays)
        if dynamic_info is None:
            return output_arrays

        assert input_shapes is not None
        return (
            output_array[
                tuple(
                    slice(
                        0,
                        # The dynamic-axis dict is keyed by axis index, so
                        # the lookup uses ``i`` and not the size ``x``.
                        None
                        if i not in out_dynamic_dict.keys()
                        else dynamic_info.retrieve_output_dim(
                            input_shapes, j, i, x
                        ),
                    )
                    for i, x in enumerate(output_array.shape)
                )
            ]
            for j, (output_array, out_dynamic_dict) in enumerate(
                zip(output_arrays, dynamic_info.outputs)
            )
        )
def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
    """Predict on a batch of PyTorch tensors.

    The tensors are converted to NumPy arrays, passed to
    ``_inner_predict`` and the results are converted back to tensors on
    the learner's device.

    Args:
        input_tensors (Tuple[Tensor]): Input tensors belonging to the same
            batch.

    Returns:
        Tuple[Tensor]: The output tensors produced by the model for the
        given (multi-) tensor input.
    """
    # Shapes are only collected when dynamic axes have to be resolved.
    if self.network_parameters.dynamic_info is None:
        input_shapes = None
    else:
        input_shapes = [tuple(tensor.shape) for tensor in input_tensors]

    numpy_inputs = (
        tensor.cpu().detach().numpy() for tensor in input_tensors
    )
    predictions = self._inner_predict(numpy_inputs, input_shapes)

    outputs = []
    for prediction in predictions:
        target = self.device.to_torch_format()
        outputs.append(torch.from_numpy(prediction).to(target))
    return tuple(outputs)
def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
    """Predict on a batch of TensorFlow tensors.

    The tensors are converted to NumPy arrays, passed to
    ``_inner_predict`` and the results are converted back to TensorFlow
    tensors.

    Args:
        input_tensors (Tuple[Tensor]): Input tensors belonging to the same
            batch.

    Returns:
        Tuple[Tensor]: The output tensors produced by the model for the
        given (multi-) tensor input.
    """
    # Shapes are only collected when dynamic axes have to be resolved.
    if self.network_parameters.dynamic_info is None:
        input_shapes = None
    else:
        input_shapes = [tuple(tensor.shape) for tensor in input_tensors]

    numpy_inputs = (tensor.numpy() for tensor in input_tensors)
    predictions = self._inner_predict(numpy_inputs, input_shapes)
    return tuple(map(tf.convert_to_tensor, predictions))
def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
    """Predict on a batch of NumPy arrays.

    Args:
        input_tensors (Tuple[ndarray]): Input arrays belonging to the same
            batch.

    Returns:
        Tuple[ndarray]: The output arrays produced by the model for the
        given (multi-) array input.
    """
    # Shapes are only collected when dynamic axes have to be resolved.
    if self.network_parameters.dynamic_info is None:
        input_shapes = None
    else:
        input_shapes = [tuple(array.shape) for array in input_tensors]

    array_stream = (array for array in input_tensors)
    predictions = self._inner_predict(array_stream, input_shapes)
    return tuple(predictions)
class Measure(Operation, abc.ABC):
    """Abstract operation that computes a measure.

    Attributes:
        measure_result: Value produced by the measure; None until a
            subclass stores a result.
    """

    def __init__(self):
        super(Measure, self).__init__()
        # No result is available before ``execute`` has run.
        self.measure_result = None

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Compute the measure. Must be implemented by subclasses."""
        raise NotImplementedError()
class LatencyOriginalModelMeasure(Measure):
    """Benchmark the original (non-optimized) model.

    Collects the model outputs for every item of the input data and
    measures the latency with the framework-specific latency function.
    """

    def __init__(self):
        super().__init__()
        # List of output tuples, one per item of the input data.
        self.outputs = None

    def execute(
        self,
        model: Any,
        input_data: DataManager,
        dl_framework: DeepLearningFramework,
    ) -> BenchmarkOriginalModelResult:
        """Run the original model and measure its latency.

        Args:
            model (Any): The original model.
            input_data (DataManager): Data used both to compute the
                reference outputs and to measure the latency.
            dl_framework (DeepLearningFramework): Framework of the model,
                used to select the run and latency functions.

        Returns:
            BenchmarkOriginalModelResult: Measured latency together with
                the collected model outputs.
        """
        self.logger.info("Benchmark performance of original model")

        # Reference outputs: the first element of each data item holds the
        # model inputs.
        collected_outputs = []
        for input_tensors in input_data:
            run_model = COMPUTE_OUTPUT_FRAMEWORK[dl_framework]
            raw_outputs = run_model(
                model, tuple(input_tensors[0]), self.device
            )
            collected_outputs.append(tuple(raw_outputs))
        self.outputs = collected_outputs

        # Latency is measured on a fixed-size sample of the data.
        latency_inputs = input_data.get_list(QUANTIZATION_DATA_NUM)
        compute_latency = COMPUTE_LATENCY_FRAMEWORK[dl_framework]
        latency, _ = compute_latency(latency_inputs, model, self.device)
        self.measure_result = latency

        self.logger.info(
            f"Original model latency: {self.measure_result} sec/iter"
        )

        return BenchmarkOriginalModelResult(
            latency_seconds=self.measure_result,
            model_outputs=self.outputs,
        )
def compute_torch_latency(
    xs: List[Tuple[torch.Tensor]],
    model: Module,
    device: Device,
    steps: int = 100,
    warmup_steps: int = 10,
) -> Tuple[float, List[float]]:
    """Compute the latency associated with the torch model.

    Args:
        xs (List[Tuple[torch.Tensor]]): List of tuples containing the
            input tensors (a single batch for the model).
        model (Module): Torch model.
        device (Device): Device where computing the latency.
        steps (int, optional): Number of timed forward passes. When it is
            larger than len(xs) the inputs are reused cyclically.
            Default: 100.
        warmup_steps (int, optional): Number of untimed forward passes
            executed before the measurement. When it is larger than
            len(xs) the inputs are reused cyclically. Default: 10.

    Returns:
        Float: Average latency.
        List[Float]: List of latencies obtained.

    Raises:
        IndexError: If ``xs`` is empty while at least one forward pass is
            requested.
    """
    if not xs and (steps > 0 or warmup_steps > 0):
        # Fail early, before the model is moved to the device.
        raise IndexError(
            "Cannot compute the latency: no input data was provided."
        )

    if device.type is not DeviceType.TPU:
        xs = [
            tuple(t.to(device.to_torch_format()) for t in tensors)
            for tensors in xs
        ]
        model = model.to(device.to_torch_format())
    model.eval()

    n_inputs = len(xs)
    latencies = []
    # NOTE(review): no explicit device synchronization is performed around
    # the timed call; on asynchronous backends the measured time may not
    # include the full kernel execution - confirm whether this is intended.
    with torch.no_grad():
        for i in range(warmup_steps):
            # Modulo indexing lets the loop run even with fewer inputs
            # than requested steps; it is a no-op otherwise.
            _ = model.forward(*xs[i % n_inputs])
        for i in range(steps):
            # perf_counter is monotonic and high-resolution, unlike the
            # wall-clock time.time().
            starting_time = time.perf_counter()
            _ = model.forward(*xs[i % n_inputs])
            latencies.append(time.perf_counter() - starting_time)
    latency = np.mean(latencies)
    return latency, latencies
def compute_optimized_running_time(
    optimized_model: BaseInferenceLearner,
    input_data: DataManager,
    steps: int = 100,
    min_steps: int = 5,
    warmup_steps: int = 10,
) -> float:
    """Measure the median running time of the optimized model.

    The measurement stops early once more than ``min_steps`` samples have
    been collected and the running median changed by less than 5% with
    respect to the previously computed one.

    Args:
        optimized_model (BaseInferenceLearner): Optimized model.
        input_data: (DataManager): Dataset used to compute latency; its
            "test" split is used.
        steps (int, optional): Maximum number of input data to be used to
            compute the latency of the model. Default: 100.
        min_steps (int, optional): Minimum number of iterations to be
            performed. Default: 5.
        warmup_steps (int, optional): Number of input data to be used to
            warm up the model. Default: 10.

    Returns:
        Float: Median latency.
    """
    # Untimed warm-up runs.
    warmup_batches = input_data.get_split("test").get_list(warmup_steps)
    for batch in warmup_batches:
        optimized_model(*batch)

    timings = []
    previous_median = None
    timed_batches = input_data.get_split("test").get_list(steps)
    for batch in timed_batches:
        tic = time.time()
        optimized_model(*batch)
        timings.append(time.time() - tic)

        if len(timings) <= min_steps:
            # Not enough samples yet to evaluate convergence.
            continue

        current_median = np.median(timings)
        if previous_median is not None:
            relative_change = (
                np.abs(current_median - previous_median) / previous_median
            )
            if relative_change < 0.05:
                # The median is stable: stop measuring.
                return current_median
        previous_median = current_median

    return np.median(timings)
" "To use accuracy metric, you must set also the labels" ) tensor_1, tensor_2, y = map(convert_to_numpy, (tensor_1, tensor_2, y)) accuracy_1 = np.mean(tensor_1.argmax(axis=-1) == y) accuracy_2 = np.mean(tensor_2.argmax(axis=-1) == y) return accuracy_1 - accuracy_2 QUANTIZATION_METRIC_MAP = { "accuracy": compute_accuracy_drop, "numeric_precision": compute_relative_difference, } ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/__init__.py ================================================ ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/__init__.py ================================================ ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/base.py ================================================ import abc from typing import Any, Dict, List, Optional from nebullvm.core.models import QuantizationType from nebullvm.operations.base import Operation class Compiler(Operation, abc.ABC): supported_ops: Dict[str, List[Optional[QuantizationType]]] def __init__(self): super().__init__() self.compiled_model = None @abc.abstractmethod def execute(self, **kwargs): raise NotImplementedError() @abc.abstractmethod def _compile_model(self, **kwargs) -> Any: raise NotImplementedError() @abc.abstractmethod def _quantize_model(self, **kwargs) -> Any: raise NotImplementedError() def get_result(self) -> Any: return self.compiled_model ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/deepsparse.py ================================================ from pathlib import Path from typing import Union from nebullvm.core.models import ( ModelParams, QuantizationType, ) from nebullvm.operations.conversions.converters import ( PytorchConverter, ) from nebullvm.operations.optimizations.compilers.base import Compiler from 
class DeepSparseCompiler(Compiler):
    """Compiler that prepares a PyTorch model for DeepSparse.

    The model is converted to ONNX through a ``PytorchConverter`` and the
    path of the resulting file is stored as the compiled model.
    """

    # Only non-quantized optimization on cpu is supported.
    supported_ops = {
        "cpu": [None],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()
        self.conversion_op = PytorchConverter()

    def execute(
        self,
        model: Module,
        onnx_output_path: str,
        model_params: ModelParams,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using DeepSparse Compiler.

        Args:
            model (torch.nn.Module): The pytorch model.
            onnx_output_path (str): Path where the converted ONNX model
                will be stored.
            model_params (ModelParams): The model parameters.
            quantization_type (QuantizationType): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                input data.
        """
        allowed_q_types = self.supported_ops[self.device.type.value]
        if quantization_type not in allowed_q_types:
            # Unsupported combination: leave no compiled model behind.
            self.compiled_model = None
            return

        static_requested = quantization_type is QuantizationType.STATIC
        if static_requested and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        compiled = self._compile_model(
            model, onnx_output_path, input_data, model_params
        )
        self.compiled_model = compiled

    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        onnx_output_path: str,
        input_data: DataManager,
        model_params: ModelParams,
    ) -> str:
        """Convert the model to ONNX and return the path of the file.

        Returns:
            str: ``<onnx_output_path>/model_pruned.onnx``.
        """
        self.conversion_op.model_name = "model_pruned"
        output_dir = Path(onnx_output_path)

        # Same chained calls as a fluent expression, one step per line.
        converter = self.conversion_op.to(self.device)
        converter = converter.set_state(model, input_data)
        converter.execute(output_dir, model_params)

        return str(output_dir / "model_pruned.onnx")

    @staticmethod
    def _quantize_model(**kwargs):
        """Quantization is not implemented for this compiler."""
        raise NotImplementedError()
def swap_bert_encoder(model, data_type, lib_path, remove_padding=False):
    """Swap the model encoder for a Faster Transformer custom encoder.

    The encoder weights are built from the model state dict, moved to
    cuda and cast according to ``data_type`` ("fp16" or "bf16"; any other
    value leaves them uncast). The scripted custom encoder is then
    installed through ``model.replace_encoder``.

    Args:
        model: Model exposing ``config``, ``state_dict()`` and
            ``replace_encoder()``.
        data_type: Precision selector, compared against "fp16" / "bf16".
        lib_path: Path of the Faster Transformer library; it is made
            absolute before being handed to the encoder.
        remove_padding: Forwarded to the custom encoder. Default: False.
    """
    cfg = model.config

    encoder_weights = EncoderWeights(
        cfg.num_hidden_layers,
        cfg.hidden_size,
        model.state_dict(),
    )
    encoder_weights.to_cuda()
    if data_type == "fp16":
        encoder_weights.to_half()
    elif data_type == "bf16":
        encoder_weights.to_bfloat16()

    absolute_lib_path = os.path.abspath(lib_path)
    head_size = cfg.hidden_size // cfg.num_attention_heads
    custom_encoder = CustomEncoder(
        cfg.num_hidden_layers,
        cfg.num_attention_heads,
        head_size,
        encoder_weights,
        remove_padding=remove_padding,
        path=absolute_lib_path,
    )

    scripted_encoder = torch.jit.script(custom_encoder)
    model.replace_encoder(scripted_encoder)
def checkpoint_quantization(
    init_dict, sparse, ths_path="./lib/libth_transformer.so"
):
    """Quantize the encoder kernels of a BERT checkpoint in place.

    For every encoder layer found in ``init_dict`` the function:

    * checks that the quantizers which must share the same amax do so;
    * replaces each kernel ``bert.encoder.layer.{i}.{kernel}.weight``
      with the output of the FasterTransformer ``weight_quantize`` op;
    * stores a float32 tensor ``bert.encoder.layer.{i}.amaxList`` packing,
      in order: four values per activation quantizer (amax, amax / 127,
      amax / 127 / 127, 127 / amax), the per-channel kernel amax values
      starting at index ``ACTIVATION_AMAX_NUM``, the int8O gemm deQuant
      scales and the three TRT fused MHA amax values.

    Args:
        init_dict: Mapping from parameter name to tensor. It must contain
            the ``..._amax`` and ``..._weight_quantizer._amax`` entries
            read below. It is modified in place.
        sparse: Forwarded unchanged to ``weight_quantize``.
        ths_path: Path of the FasterTransformer torch library to load.

    Returns:
        The same ``init_dict`` object, updated.
    """
    logger.info("Quantizing checkpoint ...")
    torch.classes.load_library(ths_path)
    weight_quantize = torch.ops.fastertransformer.weight_quantize

    def init_graph():
        # Find the number of layers and the size of each amaxList, then
        # pre-register a zero-filled amaxList for every layer.
        layer_num = 0
        # Raw string: "\d" is an invalid escape in a normal literal.
        regex = re.compile(r"layer.\d+")
        amaxTotalNum = 0
        for name, tensor_value in init_dict.items():
            if "intermediate.dense.weight" in name and amaxTotalNum == 0:
                amaxTotalNum = (
                    ACTIVATION_AMAX_NUM
                    + 9 * tensor_value.size(1)
                    + INT8O_GEMM_NUM
                    + TRT_FUSED_MHA_AMAX_NUM
                    + SCALE_RESERVE_NUM
                )
            tmp = regex.findall(name)
            if len(tmp) < 1:
                continue
            num_tmp = int(tmp[0].replace("layer.", ""))
            if layer_num < num_tmp:
                layer_num = num_tmp
        # Layer indices are zero-based.
        layer_num = layer_num + 1
        # add new var for amax
        for i in range(layer_num):
            init_dict[
                "bert.encoder.layer.{}.amaxList".format(i)
            ] = torch.zeros((amaxTotalNum,), dtype=torch.float32)
        return layer_num, amaxTotalNum

    layer_num, amaxTotalNum = init_graph()

    kernel_name_list = [
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
        "intermediate.dense",
        "output.dense",
    ]

    # 16 entries, 4 packed values each -> 64 activation slots.
    amax_name_list = [
        "attention.self.query._input_quantizer",
        "attention.self.query._aftergemm_quantizer",
        "attention.self.matmul_q_input_quantizer",
        "attention.self.key._aftergemm_quantizer",
        "attention.self.matmul_k_input_quantizer",
        "attention.self.value._aftergemm_quantizer",
        "attention.self.matmul_v_input_quantizer",
        "attention.self.softmax_input_quantizer",
        "attention.self.matmul_a_input_quantizer",
        "attention.output.dense._input_quantizer",
        "attention.output.dense._aftergemm_quantizer",
        "intermediate.dense._input_quantizer",
        "intermediate.dense._aftergemm_quantizer",
        "output.dense._input_quantizer",
        "output.dense._aftergemm_quantizer",
        "special_F2Bias_scale",
    ]

    int8O_gemm_weight_amax_list = [0 for i in range(INT8O_GEMM_NUM)]
    int8O_gemm_weight_list = [
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.self.matmul_k_input_quantizer",
        "attention.self.matmul_v_input_quantizer",
        "attention.output.dense",
        "intermediate.dense",
        "output.dense",
    ]

    int8O_gemm_input_amax_list = [0 for i in range(INT8O_GEMM_NUM)]
    int8O_gemm_input_list = [
        "attention.self.query._input_quantizer",
        "attention.self.key._input_quantizer",
        "attention.self.value._input_quantizer",
        "attention.self.matmul_q_input_quantizer",
        "attention.self.matmul_a_input_quantizer",
        "attention.output.dense._input_quantizer",
        "intermediate.dense._input_quantizer",
        "output.dense._input_quantizer",
    ]

    int8O_gemm_output_amax_list = [0 for i in range(INT8O_GEMM_NUM)]
    int8O_gemm_output_list = [
        "attention.self.query._aftergemm_quantizer",
        "attention.self.key._aftergemm_quantizer",
        "attention.self.value._aftergemm_quantizer",
        "attention.self.softmax_input_quantizer",
        "attention.output.dense._input_quantizer",
        "attention.output.dense._aftergemm_quantizer",
        "intermediate.dense._aftergemm_quantizer",
        "output.dense._aftergemm_quantizer",
    ]

    same_value_tuple_list = [
        (
            "attention.self.query._input_quantizer",
            "attention.self.key._input_quantizer",
            "attention.self.value._input_quantizer",
            "attention.output.add_residual_input_quantizer",
        ),
        (
            "intermediate.dense._input_quantizer",
            "output.add_residual_input_quantizer",
        ),
    ]

    factor = 1000000.0  # noqa: F841
    for i in range(layer_num):
        amaxList = np.zeros([amaxTotalNum]).astype(np.float32)
        amax_id = 0
        # verify some quantizers have same value.
        # input_quantizer is per-tensor quantization
        for same_value_tuple in same_value_tuple_list:
            tmp_v = init_dict[
                "bert.encoder.layer.{}.{}._amax".format(i, same_value_tuple[0])
            ].numpy()
            for same_value_name in same_value_tuple:
                tmp_v_2 = init_dict[
                    "bert.encoder.layer.{}.{}._amax".format(i, same_value_name)
                ].numpy()
                assert np.allclose(tmp_v, tmp_v_2)

        for amax_name in amax_name_list:
            if amax_name == "special_F2Bias_scale":
                if i != layer_num - 1:
                    # Uses the first quantizer of the NEXT layer.
                    quant_max = init_dict[
                        "bert.encoder.layer.{}.{}._amax".format(
                            i + 1, amax_name_list[0]
                        )
                    ].item()
                    amax = abs(quant_max)
                else:
                    # not used, placeholder
                    amax = 1.0

                amaxList[amax_id] = amax
                amax_id += 1
                amaxList[amax_id] = amax / 127.0
                amax_id += 1
                amaxList[amax_id] = amax / 127.0 / 127.0
                amax_id += 1
                amaxList[amax_id] = 127.0 / amax
                amax_id += 1
                continue

            quant_max = init_dict[
                "bert.encoder.layer.{}.{}._amax".format(i, amax_name)
            ].item()
            amax = abs(quant_max)  # round(abs(quant_max)*factor)/factor
            if amax_name in int8O_gemm_input_list:
                int8O_gemm_input_amax_list[
                    int8O_gemm_input_list.index(amax_name)
                ] = amax
                if amax_name == "attention.self.query._input_quantizer":
                    # key and value inputs share the query input amax.
                    int8O_gemm_input_amax_list[
                        int8O_gemm_input_list.index(
                            "attention.self.key._input_quantizer"
                        )
                    ] = amax
                    int8O_gemm_input_amax_list[
                        int8O_gemm_input_list.index(
                            "attention.self.value._input_quantizer"
                        )
                    ] = amax
            if amax_name in int8O_gemm_output_list:
                int8O_gemm_output_amax_list[
                    int8O_gemm_output_list.index(amax_name)
                ] = amax
            if amax_name in int8O_gemm_weight_list:
                int8O_gemm_weight_amax_list[
                    int8O_gemm_weight_list.index(amax_name)
                ] = amax
            amaxList[amax_id] = amax
            amax_id += 1
            amaxList[amax_id] = amax / 127.0
            amax_id += 1
            amaxList[amax_id] = amax / 127.0 / 127.0
            amax_id += 1
            amaxList[amax_id] = 127.0 / amax
            amax_id += 1

        # kernel amax starts from ACTIVATION_AMAX_NUM
        assert amax_id == 64
        amax_id = ACTIVATION_AMAX_NUM
        for kernel_id, kernel_name in enumerate(kernel_name_list):
            kernel = (
                init_dict[
                    "bert.encoder.layer.{}.{}.weight".format(i, kernel_name)
                ]
                .transpose(-1, -2)
                .contiguous()
            )
            quant_max2 = init_dict[
                "bert.encoder.layer.{}.{}._weight_quantizer._amax".format(
                    i, kernel_name
                )
            ]
            amax2 = abs(quant_max2)
            if amax2.dim() == 0:
                # Scalar amax: repeat it once per kernel column.
                quant_max_processed = torch.full(
                    (kernel.size(1),),
                    amax2.item(),
                    dtype=amax2.dtype,
                    device=amax2.device,
                )
            else:
                quant_max_processed = amax2.view(-1)
            kernel_processed = weight_quantize(
                kernel, quant_max_processed.cuda(), sparse
            )
            init_dict[
                "bert.encoder.layer.{}.{}.weight".format(i, kernel_name)
            ] = kernel_processed
            if kernel_name in int8O_gemm_weight_list:
                int8O_gemm_weight_amax_list[
                    int8O_gemm_weight_list.index(kernel_name)
                ] = quant_max_processed[0]
            for e in quant_max_processed:
                amaxList[amax_id] = e
                amax_id += 1

        # for int8O gemm deQuant
        for j in range(INT8O_GEMM_NUM):
            amaxList[amax_id] = (
                int8O_gemm_input_amax_list[j] * int8O_gemm_weight_amax_list[j]
            ) / (127.0 * int8O_gemm_output_amax_list[j])
            amax_id += 1

        # for trt fused MHA amax
        # QKV_addBias_amax
        amaxList[amax_id] = np.maximum(
            np.maximum(amaxList[8], amaxList[16]), amaxList[24]
        )
        amax_id += 1
        # softmax amax
        amaxList[amax_id] = amaxList[32]
        amax_id += 1
        # bmm2 amax
        amaxList[amax_id] = amaxList[36]
        amax_id += 1

        # Overwrites the zero-filled placeholder created by init_graph.
        init_dict["bert.encoder.layer.{}.amaxList".format(i)] = torch.tensor(
            amaxList, dtype=torch.float32
        )

    logger.info("Quantizing checkpoint done.")
    return init_dict
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import List, Optional from loguru import logger from nebullvm.optional_modules.torch import torch, torch_distributed as dist from nebullvm.optional_modules.huggingface import ( BertConfig, BertEmbeddings, BertEncoder, BertPooler, BertPreTrainedModel, ) from .checkpoint_quantization import checkpoint_quantization class EncoderWeights(object): def __init__( self, layer_num, hidden_dim, weights=None, sparse=False, tensor_para_size=1, pipeline_para_size=1, ): """weights need be a state_dict of bert model""" self.layer_num = layer_num self.int8 = False self.hidden_dim = hidden_dim self.weights = {} self.tensor_para_size = tensor_para_size self.pipeline_para_size = pipeline_para_size self.use_mpi = dist.is_mpi_available() if self.use_mpi: try: dist.init_process_group(backend="mpi") except: # noqa: E722 logger.info( "[INFO] WARNING: Exception occurred in " "dist.init_process_group(backend='mpi')." "Maybe the process group has been initialized somewhere else." # noqa: E501 ) else: logger.info("[INFO] MPI is not available in this PyTorch build.") assert ( tensor_para_size == 1 ), "[FATAL] MPI is required for tensor_para_size > 1." assert ( pipeline_para_size == 1 ), "[FATAL] MPI is required for pipeline_para_size > 1." 
self.rank = dist.get_rank() if self.use_mpi else 0 self.device_count = torch.cuda.device_count() self.device = self.rank % self.device_count torch.cuda.set_device(self.device) world_size = dist.get_world_size() if self.use_mpi else 1 # noqa: F841 self.tensor_para_rank = self.rank % self.tensor_para_size self.pipeline_para_rank = self.rank // self.tensor_para_size if weights is None: self._generated_weights = True for i in range(layer_num): pre = "encoder.layer." + str(i) + "." self.weights[ pre + "attention.self.query.weight" ] = torch.zeros(hidden_dim, hidden_dim) self.weights[pre + "attention.self.query.bias"] = torch.zeros( hidden_dim ) self.weights[pre + "attention.self.key.weight"] = torch.zeros( hidden_dim, hidden_dim ) self.weights[pre + "attention.self.key.bias"] = torch.zeros( hidden_dim ) self.weights[ pre + "attention.self.value.weight" ] = torch.zeros(hidden_dim, hidden_dim) self.weights[pre + "attention.self.value.bias"] = torch.zeros( hidden_dim ) self.weights[ pre + "attention.output.dense.weight" ] = torch.zeros(hidden_dim, hidden_dim) self.weights[ pre + "attention.output.dense.bias" ] = torch.zeros(hidden_dim) self.weights[ pre + "attention.output.LayerNorm.weight" ] = torch.zeros(hidden_dim) self.weights[ pre + "attention.output.LayerNorm.bias" ] = torch.zeros(hidden_dim) self.weights[pre + "intermediate.dense.weight"] = torch.zeros( 4 * hidden_dim, hidden_dim ) # noqa: E501 self.weights[pre + "intermediate.dense.bias"] = torch.zeros( 4 * hidden_dim ) self.weights[pre + "output.dense.weight"] = torch.zeros( hidden_dim, 4 * hidden_dim ) self.weights[pre + "output.dense.bias"] = torch.zeros( hidden_dim ) self.weights[pre + "output.LayerNorm.weight"] = torch.zeros( hidden_dim ) self.weights[pre + "output.LayerNorm.bias"] = torch.zeros( hidden_dim ) for k, v in self.weights.items(): if not k.endswith("_amax"): self.weights[k] = torch.nn.init.uniform_(v, -1, 1) if sparse: for k, v in self.weights.items(): if ( "query.weight" in k or "key.weight" in k 
def listed_weights(self):
    """Stack the per-layer encoder weights into one flat, ordered list.

    Every returned tensor has the layer index as its first dimension and is
    contiguous. The position of each tensor in the list is significant: the
    consumer unpacks the list positionally.

    Non-int8 mode returns 16 tensors covering only the layers owned by this
    pipeline rank (``[start_layer, end_layer)``). Dense weight matrices are
    transposed on their last two dims, and tensors with a shard dimension
    are split into ``tensor_para_size`` equal chunks of which the
    ``tensor_para_rank``-th is kept.

    Int8 mode returns 18 tensors: the same 16 entries taken as stored (no
    transpose, no sharding) for *all* ``layer_num`` layers, followed by the
    per-layer ``amaxList`` and ``h_amaxList`` entries.

    Returns:
        list of torch.Tensor in the order described above.

    Raises:
        KeyError: if an expected ``encoder.layer.<i>.<name>`` entry is
            missing from ``self.weights``.
    """
    # (key suffix, transpose last two dims?, dim of the *stacked* tensor to
    # shard for tensor parallelism, or None to keep the tensor whole).
    # Indices 6 and 12 are sharded along dim 1; the others along the last dim.
    layout = (
        ("attention.self.query.weight", True, -1),  # 0
        ("attention.self.query.bias", False, -1),
        ("attention.self.key.weight", True, -1),  # 2
        ("attention.self.key.bias", False, -1),
        ("attention.self.value.weight", True, -1),  # 4
        ("attention.self.value.bias", False, -1),
        ("attention.output.dense.weight", True, 1),  # 6
        ("attention.output.dense.bias", False, None),
        ("attention.output.LayerNorm.weight", False, None),
        ("attention.output.LayerNorm.bias", False, None),
        ("intermediate.dense.weight", True, -1),  # 10
        ("intermediate.dense.bias", False, -1),
        ("output.dense.weight", True, 1),  # 12
        ("output.dense.bias", False, None),
        ("output.LayerNorm.weight", False, None),
        ("output.LayerNorm.bias", False, None),
    )

    def stack_layers(suffix, layer_ids, transpose=False):
        """Stack ``encoder.layer.<i>.<suffix>`` over ``layer_ids`` on dim 0."""
        per_layer = [
            self.weights["encoder.layer." + str(idx) + "." + suffix]
            for idx in layer_ids
        ]
        if transpose:
            per_layer = [t.transpose(-1, -2) for t in per_layer]
        return torch.stack(per_layer, 0).contiguous()

    start_layer = (
        self.pipeline_para_rank * self.layer_num // self.pipeline_para_size
    )
    end_layer = (
        (self.pipeline_para_rank + 1)
        * self.layer_num
        // self.pipeline_para_size
    )

    if self.int8:
        # Int8 weights are taken as stored, for every layer.
        every_layer = range(self.layer_num)
        suffixes = [suffix for suffix, _, _ in layout]
        suffixes += ["amaxList", "h_amaxList"]
        return [stack_layers(suffix, every_layer) for suffix in suffixes]

    ret = []
    local_layers = range(start_layer, end_layer)
    for suffix, transpose, shard_dim in layout:
        stacked = stack_layers(suffix, local_layers, transpose)
        if shard_dim is not None:
            chunk = stacked.shape[shard_dim] // self.tensor_para_size
            stacked = stacked.split(chunk, dim=shard_dim)[
                self.tensor_para_rank
            ].contiguous()
        ret.append(stacked)
    return ret
self.weights[ pre + "attention.self.query._input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.query._weight_quantizer._amax" ] = amax_tensor_1 self.weights[ pre + "attention.self.query._aftergemm_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.key._input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.key._weight_quantizer._amax" ] = amax_tensor_1 self.weights[ pre + "attention.self.key._aftergemm_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.value._input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.value._weight_quantizer._amax" ] = amax_tensor_1 self.weights[ pre + "attention.self.value._aftergemm_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.matmul_q_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.matmul_k_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.matmul_v_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.matmul_a_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.self.softmax_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.output.dense._input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.output.dense._weight_quantizer._amax" ] = amax_tensor_1 self.weights[ pre + "attention.output.dense._aftergemm_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.output.add_local_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "attention.output.add_residual_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "intermediate.dense._input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "intermediate.dense._weight_quantizer._amax" ] = amax_tensor_2 self.weights[ pre + "intermediate.dense._aftergemm_quantizer._amax" ] = torch.tensor(127.0) 
self.weights[ pre + "output.dense._input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "output.dense._weight_quantizer._amax" ] = amax_tensor_1 self.weights[ pre + "output.dense._aftergemm_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "output.add_local_input_quantizer._amax" ] = torch.tensor(127.0) self.weights[ pre + "output.add_residual_input_quantizer._amax" ] = torch.tensor(127.0) if ( "encoder.layer.0.attention.self.query._input_quantizer._amax" not in self.weights ): raise RuntimeError( "There is no quantization node in the checkpoint, cannot be quantized to int8." # noqa: E501 ) if self.int8: return self.int8 = True for k, v in self.weights.items(): if k.endswith("bias") or k.endswith("LayerNorm.weight"): self.weights[k] = v.half() elif k.endswith("weight"): self.weights[k] = v.float().cuda() else: self.weights[k] = v.float().cpu() self.weights = checkpoint_quantization( self.weights, sparse, ths_path, verbose=False ) class CustomEncoder(torch.nn.Module): def __init__( self, layer_num, head_num, head_size, weights, int8_mode=0, remove_padding=False, sparse=False, path="./lib/libth_transformer.so", tensor_para_size=1, pipeline_para_size=1, ): super().__init__() self.layer_num = layer_num self.remove_padding = remove_padding self.int8_mode = int8_mode logger.info(f"loading faster transformer library from {path}") torch.classes.load_library(path) weights_ = weights.listed_weights() self.use_mpi = dist.is_mpi_available() if self.use_mpi: try: dist.init_process_group(backend="mpi") except: # noqa: E722 logger.info( "[INFO] WARNING: Exception occurred in" "dist.init_process_group(backend='mpi')." "Maybe the process group has been initialized somewhere else." # noqa: E501 ) else: logger.info("[INFO] MPI is not available in this PyTorch build.") assert ( tensor_para_size == 1 ), "[FATAL] MPI is required for tensor_para_size > 1." assert ( pipeline_para_size == 1 ), "[FATAL] MPI is required for pipeline_para_size > 1." 
class HuggingFaceEncoder(torch.nn.Module):
    """Plain HuggingFace ``BertEncoder`` initialised from a weight container.

    Args:
        layer_num: number of encoder layers.
        head_num: number of attention heads.
        head_size: size of each attention head; the hidden size is
            ``head_num * head_size`` and the intermediate size four times that.
        weights: object exposing a ``weights`` dict. Entries whose key starts
            with ``encoder.`` and does not end with ``_amax`` are loaded into
            the encoder. Required despite the ``None`` default.

    Raises:
        ValueError: if ``weights`` is ``None``.
    """

    # Prefix carried by encoder entries in the weight container; it has to be
    # removed to obtain ``BertEncoder`` state-dict keys (``layer.<i>...``).
    _KEY_PREFIX = "encoder."

    def __init__(self, layer_num, head_num, head_size, weights=None):
        super().__init__()
        if weights is None:
            raise ValueError(
                "HuggingFaceEncoder requires `weights` to initialise the "
                "encoder."
            )
        hidden_dim = head_num * head_size
        # TODO(bhsueh) The implementation of hidden_act='gelu' is different
        # to FT's (and google BERT) implementation
        # FT's implementation is equivalent to hidden_act='gelu_new',
        # but there are some issues for int8 sparse under gelu_new
        conf = BertConfig(
            hidden_size=hidden_dim,
            intermediate_size=4 * hidden_dim,
            num_attention_heads=head_num,
            num_hidden_layers=layer_num,
            hidden_act="gelu",
        )
        self.encoder = BertEncoder(conf)

        # Strip exactly the "encoder." prefix. The previous fixed slice
        # ``k[13:]`` matched a 13-character "bert.encoder." prefix and turned
        # "encoder.layer.0..." into ".0...", which load_state_dict rejects.
        prefix = self._KEY_PREFIX
        state = {
            key[len(prefix):]: value
            for key, value in weights.weights.items()
            if key.startswith(prefix) and not key.endswith("_amax")
        }
        self.encoder.load_state_dict(state)
        self.head_mask = [None] * layer_num

    def forward(self, hidden_states, attention_mask):
        """Run the encoder and return its tuple output (``return_dict=False``).

        ``attention_mask`` is converted to an additive mask: entries equal to
        1 become 0 and entries equal to 0 become -10000.
        """
        extended_attention_mask = (1.0 - attention_mask) * -10000.0
        output = self.encoder(
            hidden_states,
            extended_attention_mask,
            self.head_mask,
            return_dict=False,
        )
        return output
class BertModel(BertPreTrainedModel):
    """BERT backbone (embeddings, encoder, pooler) with a swappable encoder.

    After ``replace_encoder`` has been called, ``forward`` builds a
    multiplicative pairwise mask plus per-sequence lengths and calls the
    encoder positionally as ``encoder(embeddings, mask, seq_lens)``.
    Otherwise it builds the usual additive mask and calls the built-in
    ``BertEncoder`` with keyword arguments.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.init_weights()
        # Set by replace_encoder(); selects mask format and call convention.
        self.use_ext_encoder = False

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Encode the inputs and return ``(sequence_output, pooled_output, *extras)``.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given.
        ``attention_mask`` defaults to all ones and ``token_type_ids`` to all
        zeros. The caller's ``head_mask`` and the remaining keyword arguments
        are accepted but not used by this implementation.

        Raises:
            ValueError: if both or neither of ``input_ids`` and
                ``inputs_embeds`` are given, or (built-in encoder only) if
                ``attention_mask`` is neither 2-D nor 3-D.
        """
        has_ids = input_ids is not None
        has_embeds = inputs_embeds is not None
        if has_ids and has_embeds:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"  # noqa: E501
            )
        if not has_ids and not has_embeds:
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds"
            )

        if has_ids:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            # Drop the trailing embedding dimension.
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(
                input_shape, dtype=torch.long, device=device
            )

        if self.use_ext_encoder:
            # Only 2-D masks are handled on this path. The mask stays
            # multiplicative: the outer product of the mask with itself.
            assert attention_mask.dim() == 2
            row_mask = attention_mask.view(
                -1, 1, 1, attention_mask.size(-1)
            )
            pair_mask = row_mask * row_mask.transpose(-1, -2)
            # fp16 compatibility
            extended_attention_mask = pair_mask.to(
                dtype=next(self.parameters()).dtype
            )
            seq_lens = torch.sum(
                attention_mask, 1, dtype=torch.int32
            ).cuda()
        else:
            # A 3-D mask is a ready-made self-attention mask that only needs
            # a head axis; a 2-D mask also needs a query axis.
            mask_rank = attention_mask.dim()
            if mask_rank == 3:
                broadcastable = attention_mask[:, None, :, :]
            elif mask_rank == 2:
                broadcastable = attention_mask[:, None, None, :]
            else:
                raise ValueError(
                    "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(  # noqa: E501
                        input_shape, attention_mask.shape
                    )
                )
            # fp16 compatibility
            broadcastable = broadcastable.to(
                dtype=next(self.parameters()).dtype
            )
            # 1 -> 0.0 and 0 -> -10000.0: the mask is added to the raw
            # attention scores before the softmax.
            extended_attention_mask = (1.0 - broadcastable) * -10000.0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )

        if self.use_ext_encoder:
            encoder_outputs = self.encoder(
                embedding_output, extended_attention_mask, seq_lens
            )
        else:
            # The caller-supplied head_mask is replaced: no heads are masked.
            head_mask = [None] * self.config.num_hidden_layers
            encoder_outputs = self.encoder(
                embedding_output,
                attention_mask=extended_attention_mask,
                head_mask=head_mask,
            )

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)
        # Append whatever else the encoder returned after its first element.
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    def replace_encoder(self, new_encoder):
        """Swap in ``new_encoder`` and switch ``forward`` to the external path."""
        self.encoder = new_encoder
        self.use_ext_encoder = True
class FasterTransformerGPT2Wrapper(torch.nn.Module):
    """Expose a FasterTransformer GPT model through a HuggingFace-like ``generate``.

    Args:
        model: the FasterTransformer GPT decoder; must provide ``device`` and
            a keyword-only compatible ``generate`` method.
        config: the original HuggingFace model config; ``n_ctx`` is read when
            no generation length is supplied.
    """

    def __init__(self, model: "gpt_decoder.Gpt", config):
        super().__init__()
        self.model = model
        self.config = config
        self.device = model.device

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        max_length: Optional[int] = None,
        min_length: Optional[int] = None,
        do_sample: Optional[bool] = None,
        early_stopping: Optional[bool] = None,
        num_beams: Optional[int] = 1,
        temperature: Optional[float] = None,
        penalty_alpha: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        typical_p: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
        bad_words_ids: Optional[Iterable[int]] = None,
        force_words_ids: Optional[
            Union[Iterable[int], Iterable[Iterable[int]]]
        ] = None,
        bos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        length_penalty: Optional[float] = None,
        no_repeat_ngram_size: Optional[int] = None,
        encoder_no_repeat_ngram_size: Optional[int] = None,
        num_return_sequences: Optional[int] = None,
        max_time: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
        decoder_start_token_id: Optional[int] = None,
        use_cache: Optional[bool] = None,
        num_beam_groups: Optional[int] = None,
        diversity_penalty: Optional[float] = None,
        prefix_allowed_tokens_fn: Optional[
            Callable[[int, torch.Tensor], List[int]]
        ] = None,
        # logits_processor, renormalize_logits, stopping_criteria and
        # constraints from the HuggingFace signature are not accepted.
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        forced_bos_token_id: Optional[int] = None,
        forced_eos_token_id: Optional[int] = None,
        remove_invalid_values: Optional[bool] = None,
        synced_gpus: Optional[bool] = False,
        exponential_decay_length_penalty: Optional[Tuple[int, float]] = None,
        suppress_tokens: Optional[List[int]] = None,
        begin_suppress_tokens: Optional[List[int]] = None,
        forced_decoder_ids: Optional[List[List[int]]] = None,
    ):
        """Generate tokens for a batch of prompts.

        Only ``inputs``, ``max_length``, ``max_new_tokens``, ``min_length``,
        ``num_beams``, ``temperature``, ``top_k``, ``top_p``,
        ``repetition_penalty``, ``length_penalty`` and ``eos_token_id`` are
        forwarded to the underlying model; every other parameter exists for
        signature compatibility and is ignored.

        The number of tokens to generate is ``max_new_tokens`` when given,
        otherwise ``max_length - len(inputs[0])`` with ``max_length``
        defaulting to ``config.n_ctx``.

        Returns:
            list with one 1-D tensor per batch item: the first beam of
            ``output_token_ids`` truncated to that item's ``output_lengths``.
            NOTE(review): this presumably still includes the prompt tokens —
            confirm against the decoder.

        Raises:
            ValueError: if ``inputs`` is ``None``.
        """
        if inputs is None:
            raise ValueError("`inputs` must be provided to generate().")

        batch_size = len(inputs)
        input_lengths = torch.tensor(
            [len(sequence) for sequence in inputs],
            dtype=torch.int32,
            device=self.model.device,
        )

        def convert_to_tensor_if_not(value, dtype=torch.float32):
            """Broadcast a scalar to a per-sample CPU tensor; pass through None/tensors."""
            if value is None or isinstance(value, torch.Tensor):
                return value
            return value * torch.ones(batch_size, dtype=dtype)  # cpu tensor

        top_k = convert_to_tensor_if_not(top_k, dtype=torch.int32)
        top_p = convert_to_tensor_if_not(top_p, dtype=torch.float32)
        temperature = convert_to_tensor_if_not(
            temperature, dtype=torch.float32
        )
        repetition_penalty = convert_to_tensor_if_not(
            repetition_penalty, dtype=torch.float32
        )
        min_length = convert_to_tensor_if_not(min_length, dtype=torch.int32)
        len_penalty = convert_to_tensor_if_not(
            length_penalty, dtype=torch.float32
        )

        if max_new_tokens is not None:
            # An explicit new-token budget takes precedence over max_length.
            gen_length = max_new_tokens
        else:
            if max_length is None:
                # gen_length is required for faster transformer;
                # infer it from the model config.
                max_length = self.config.n_ctx
            gen_length = max_length - len(inputs[0])

        output_dict = self.model.generate(
            input_token_ids=inputs,
            input_lengths=input_lengths,
            gen_length=gen_length,
            eos_token_id=eos_token_id,
            beam_width=num_beams,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            min_length=min_length,
            len_penalty=len_penalty,
            return_output_length=True,
            return_log_probs=False,
        )

        output_token_ids = output_dict["output_token_ids"]
        output_lengths = output_dict["output_lengths"]
        # Keep beam 0 of every sample, cut at its reported output length.
        return [
            output_token_ids[i, 0, : output_lengths[i]]
            for i in range(batch_size)
        ]
int8_mode = 0 # 0: no quantization, 1: quantize weights to int8 # load faster transformer model, note that the lm_head is not saved # it's reconstructed during loading from the embedding weights gpt = gpt_decoder.Gpt( num_heads=head_num, size_per_head=size_per_head, num_layers=layer_num, vocab_size=vocab_size, start_id=start_id, end_id=end_id, tensor_para_size=tensor_para_size, pipeline_para_size=pipeline_para_size, lib_path=lib_path, max_seq_len=max_seq_len, int8_mode=int8_mode, inference_data_type=data_type, weights_data_type=weights_data_type, use_fp32_to_compute_logit=use_fp32_to_compute_logit, ) gpt.load(ckpt_path, data_type) return FasterTransformerGPT2Wrapper(gpt, model.config) # from transformers import GPT2LMHeadModel, GPT2Tokenizer # tokenizer = GPT2Tokenizer.from_pretrained("gpt2") # tokenizer.pad_token = tokenizer.eos_token # model = hf_model = GPT2LMHeadModel.from_pretrained("gpt2").to("cuda").eval() # hf_config = hf_model.config.to_dict() # model = GPT2LMHeadModel.from_pretrained("gpt2") # tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # weight_data_type = weights_data_type = "fp32" # fp32 or fp16 # data_type = "fp32" # fp32 or fp16 # faster_model= convert_gpt2_lm_head_model( # model, tokenizer, # weight_data_type=weight_data_type, # data_type=data_type) ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/__init__.py ================================================ ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/gpt_decoder.py ================================================ # Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/gpt_decoder.py # noqa: E501 # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
def to_numpy_dtype(maybe_str_dtype: Union[str, np.dtype]):
    """Normalise a dtype specification to something NumPy accepts as ``dtype=``.

    Args:
        maybe_str_dtype: one of the names ``"fp16"``, ``"float16"``,
            ``"fp32"``, ``"float32"``; an ``np.dtype`` instance; or a NumPy
            scalar type such as ``np.float16``.

    Returns:
        ``np.float16`` / ``np.float32`` for a recognised name; otherwise the
        argument itself, unchanged.

    Raises:
        ValueError: if a string is not one of the recognised names.
        TypeError: if the argument is not a string, an ``np.dtype`` or a
            NumPy scalar type.
    """
    if isinstance(maybe_str_dtype, str):
        names = {
            "fp16": np.float16,
            "float16": np.float16,
            "fp32": np.float32,
            "float32": np.float32,
        }
        try:
            return names[maybe_str_dtype]
        except KeyError:
            raise ValueError(
                f"Cannot convert to numpy data type, got {maybe_str_dtype}"
            )

    # NumPy scalar types are what the string branch returns, so they must be
    # accepted too; otherwise the function rejects its own output.
    is_scalar_type = isinstance(maybe_str_dtype, type) and issubclass(
        maybe_str_dtype, np.generic
    )
    if isinstance(maybe_str_dtype, np.dtype) or is_scalar_type:
        return maybe_str_dtype

    # Explicit error instead of an assert, which is stripped under ``-O``.
    raise TypeError(
        "Expected a str, np.dtype or numpy scalar type, got "
        f"{type(maybe_str_dtype).__name__}"
    )
weight_dtype: str or np.dtype, the data type of the stored weight. """ weight_dtype = to_numpy_dtype(weight_dtype) return torch.from_numpy(np.fromfile(checkpoint_path, dtype=weight_dtype)) LayernormType = Literal["pre_layernorm", "post_layernorm"] class GptLayerWeights: def __init__( self, num_heads: int, size_per_head: int, inter_size: int, num_layers: int, tensor_para_size: int = 1, pipeline_para_size: int = 1, has_adapters: bool = False, adapter_inter_size: int = 0, int8_mode: int = 0, ): assert num_heads % tensor_para_size == 0, ( f"num_heads ({num_heads}) is not multiple of " "tensor para size ({tensor_para_size})" ) self.num_heads = num_heads self.size_per_head = size_per_head self.hidden_units = num_heads * size_per_head self.num_layers = num_layers self.tensor_para_size = tensor_para_size self.tensor_para_rank = comm.get_tensor_para_rank() self.pipeline_para_size = pipeline_para_size self.pipeline_para_rank = comm.get_pipeline_para_rank() self.has_adapters = has_adapters self.adapter_inter_size = adapter_inter_size self.local_num_layers = num_layers // pipeline_para_size self.local_num_heads = num_heads // tensor_para_size self.local_hidden_units = self.local_num_heads * size_per_head self.local_inter_size = inter_size // tensor_para_size self.local_adapter_inter_size = ( self.adapter_inter_size // tensor_para_size ) self.weight_transpose_calibrate_quantize = None assert int8_mode in [0, 1], "Invalid int8 mode for GPT. 
Must be 0 or 1" self.int8_mode = int8_mode if self.int8_mode == 1: quant = ( torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix # noqa: E501 ) self.weight_transpose_calibrate_quantize = lambda x: quant( x, torch.int8 ) self.weights = None self.int8_weights = None self.int8_scales = None self.expected_weight_shapes = list() # pylint:disable=line-too-long # Transformer blocks self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # input layernorm weight self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # input layernorm bias self.expected_weight_shapes.extend( [(self.hidden_units, self.local_hidden_units * 3)] * self.local_num_layers ) # attention qkv weight self.expected_weight_shapes.extend( [(self.local_hidden_units * 3,)] * self.local_num_layers ) # attention qkv bias self.expected_weight_shapes.extend( [(self.local_hidden_units, self.hidden_units)] * self.local_num_layers ) # attention dense weight self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # attention dense bias self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # post attention layernorm weight self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # post attention layernorm bias self.expected_weight_shapes.extend( [(self.hidden_units, self.local_inter_size)] * self.local_num_layers ) # ffn_kernel1 self.expected_weight_shapes.extend( [(self.local_inter_size,)] * self.local_num_layers ) # ffn_bias1 self.expected_weight_shapes.extend( [(self.local_inter_size, self.hidden_units)] * self.local_num_layers ) # ffn_kernel2 self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # ffn_bias2 # Adapters if self.has_adapters: self.expected_weight_shapes.extend( [(self.hidden_units, self.local_adapter_inter_size)] * self.local_num_layers ) # adaptor1_kernel1 self.expected_weight_shapes.extend( 
[(self.local_adapter_inter_size,)] * self.local_num_layers ) # adaptor1_bias1 self.expected_weight_shapes.extend( [(self.local_adapter_inter_size, self.hidden_units)] * self.local_num_layers ) # adaptor1_kernel2 self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # adaptor1_bias2 self.expected_weight_shapes.extend( [(self.hidden_units, self.local_adapter_inter_size)] * self.local_num_layers ) # adaptor2_kernel1 self.expected_weight_shapes.extend( [(self.local_adapter_inter_size,)] * self.local_num_layers ) # adaptor2_bias1 self.expected_weight_shapes.extend( [(self.local_adapter_inter_size, self.hidden_units)] * self.local_num_layers ) # adaptor2_kernel2 self.expected_weight_shapes.extend( [(self.hidden_units,)] * self.local_num_layers ) # adaptor2_bias2 # pylint:enable=line-too-long @classmethod def from_config(cls, config: GptInitModelParameters): return cls( num_heads=config.head_num, size_per_head=config.size_per_head, inter_size=4 * config.head_num * config.size_per_head, num_layers=config.layer_num, tensor_para_size=config.tensor_para_size, pipeline_para_size=config.pipeline_para_size, has_adapters=config.has_adapters, adapter_inter_size=config.adapter_inter_size, int8_mode=config.int8_mode, ) @property def dtype(self): return self.weights[0].dtype @property def device(self): return self.weights[0].device def _map(self, func): for i in range(len(self.weights)): if isinstance(self.weights[i], list): for j in range(len(self.weights[i])): self.weights[i][j] = func(self.weights[i][j]) else: self.weights[i] = func(self.weights[i]) def _map_int8(self, func): for i in range(len(self.int8_weights)): if isinstance(self.int8_weights[i], list): for j in range(len(self.int8_weights[i])): self.int8_weights[i][j] = func(self.int8_weights[i][j]) else: self.int8_weights[i] = func(self.int8_weights[i]) for i in range(len(self.int8_scales)): if isinstance(self.int8_scales[i], list): for j in range(len(self.int8_scales[i])): 
def load(
    self,
    checkpoint_path: PathLike,
    compute_dtype: torch.dtype,
    weight_dtype: Optional[Union[str, np.dtype]] = None,
    device: Optional[Union[int, str, torch.device]] = None,
):
    """Load checkpoint weights.

    One raw ``.bin`` file is read per expected weight, in the same order
    as ``self.expected_weight_shapes``, and appended to ``self.weights``.
    When ``int8_mode`` is enabled, the quantizable matrices are stored in
    ``self.int8_weights`` / ``self.int8_scales`` and an empty placeholder
    tensor is appended to ``self.weights`` so the indices stay aligned.

    # Args.
        checkpoint_path: str or Path, a checkpoint directory where FT
            checkpoint files locate.
        compute_dtype: torch.dtype, every loaded weight is cast to it.
        weight_dtype: str or np.dtype, the data type of stored weights.
        device: optional device the loaded tensors are moved to; tensors
            are left where they were created when None.
    # Raises.
        FileNotFoundError: if ``checkpoint_path`` does not exist.
        ValueError: if a loaded weight cannot be reshaped to its
            expected shape.
    """
    checkpoint_path = Path(checkpoint_path)
    if not checkpoint_path.exists():
        raise FileNotFoundError(
            f"Could not find checkpoint {str(checkpoint_path)}"
        )

    weight_dtype = to_numpy_dtype(weight_dtype)
    print(
        f"Load weights from {str(checkpoint_path)} "
        f"(data type: {weight_dtype})"
    )

    self.weights = list()
    self.int8_weights = list()
    self.int8_scales = list()
    torch.cuda.empty_cache()

    # File-name fragments of the matrices that get int8-quantized.
    quant_sub_names = (
        "attention.query_key_value.weight",
        "attention.dense.weight",
        "dense_h_to_4h.weight",
        "dense_4h_to_h.weight",
    )

    def _load_from_file(fname):
        _weight = torch.from_numpy(
            np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
        )
        _weight = _weight.to(compute_dtype)
        # The next free slot in self.weights selects the expected shape.
        expected_shape = self.expected_weight_shapes[len(self.weights)]
        try:
            # Empty files are kept as empty tensors (no reshape).
            if _weight.nelement() > 0:
                _weight = _weight.reshape(expected_shape)
        except (RuntimeError, TypeError, ValueError) as err:
            raise ValueError(
                f"num_heads, size_per_head, vocab_size, and max_seq_len must be the same "  # noqa: E501
                f"as the ones during training (weight: {fname} expected shape: {expected_shape}, "  # noqa: E501
                f"got shape: {_weight.shape})."
            ) from err

        should_quantize = any(
            sub_name in fname for sub_name in quant_sub_names
        )
        if self.int8_mode != 0 and should_quantize:
            calibrate = self.weight_transpose_calibrate_quantize
            int8_weight, int8_scales = calibrate(_weight)
            # int8 weights should appear in same order as FP weights.
            # Move to device and add to the int8 list.
            dummy_weight = torch.empty(0, dtype=compute_dtype)
            if device is not None:
                int8_weight = int8_weight.to(device)
                int8_scales = int8_scales.to(device)
                dummy_weight = dummy_weight.to(device)
            self.int8_weights.append(int8_weight)
            self.int8_scales.append(int8_scales)
            self.weights.append(dummy_weight)
        else:
            if device is not None:
                _weight = _weight.to(device)
            self.weights.append(_weight)

    tp_rank = self.tensor_para_rank
    # Per-layer file suffixes, listed in the order of
    # self.expected_weight_shapes. Suffixes carrying the tensor-parallel
    # rank are stored as one file per rank.
    layer_files = [
        "input_layernorm.weight.bin",
        "input_layernorm.bias.bin",
        f"attention.query_key_value.weight.{tp_rank}.bin",
        f"attention.query_key_value.bias.{tp_rank}.bin",
        f"attention.dense.weight.{tp_rank}.bin",
        "attention.dense.bias.bin",
        "post_attention_layernorm.weight.bin",
        "post_attention_layernorm.bias.bin",
        f"mlp.dense_h_to_4h.weight.{tp_rank}.bin",
        f"mlp.dense_h_to_4h.bias.{tp_rank}.bin",
        f"mlp.dense_4h_to_h.weight.{tp_rank}.bin",
        "mlp.dense_4h_to_h.bias.bin",
    ]
    if self.has_adapters:
        for adapter in ("after_attention_adapter", "after_ffn_adapter"):
            layer_files.extend(
                [
                    f"{adapter}.dense_h_to_4h.weight.{tp_rank}.bin",
                    f"{adapter}.dense_h_to_4h.bias.{tp_rank}.bin",
                    f"{adapter}.dense_4h_to_h.weight.{tp_rank}.bin",
                    f"{adapter}.dense_4h_to_h.bias.bin",
                ]
            )

    # Load: all layers of one weight kind first, then the next kind.
    layer_offset = self.local_num_layers * self.pipeline_para_rank
    for suffix in layer_files:
        for i in range(self.local_num_layers):
            _load_from_file(f"model.layers.{layer_offset + i}.{suffix}")

    assert len(self.weights) == len(
        self.expected_weight_shapes
    ), "Incorrect number of weights loaded"
class GptContextDecoder(FtModuleBase):
    """Python wrapper of the FT ``ParallelGptContextDecoderOp``.

    The native op is created lazily by ``_initialize_model`` and invoked
    from ``forward``.
    """

    def __init__(
        self,
        num_heads: int,
        size_per_head: int,
        inter_size: int,
        num_layers: int,
        tensor_para_size: int = 1,
        pipeline_para_size: int = 1,
        remove_padding: bool = True,
        shared_contexts_ratio: float = 1.0,
        layernorm_eps: float = 1e-6,
        layernorm_type: LayernormType = "pre_layernorm",
        activation_type: str = "gelu",
        has_adapters: bool = False,
        adapter_inter_size: int = 0,
        int8_mode: int = 0,
    ):
        """Store the model hyper-parameters; no native op is built yet."""
        super().__init__()

        # Model geometry.
        self.num_heads = num_heads
        self.size_per_head = size_per_head
        self.hidden_size = self.num_heads * self.size_per_head
        self.inter_size = inter_size
        self.num_layers = num_layers

        # Parallelism layout.
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size

        # Runtime options handed to the FT op.
        self.remove_padding = remove_padding
        self.shared_contexts_ratio = shared_contexts_ratio
        self.layernorm_eps = layernorm_eps
        self.layernorm_type = layernorm_type
        self.activation_type = activation_type
        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size

        assert int8_mode in [0, 1]
        self.int8_mode = int8_mode

        # Both are filled in later (set_weight / _initialize_model).
        self.ft_op = None
        self.weight = None

    def __repr__(self):
        """Return the class name followed by a ``name: value`` listing."""
        shown_fields = (
            "num_heads",
            "size_per_head",
            "hidden_size",
            "inter_size",
            "num_layers",
            "tensor_para_size",
            "pipeline_para_size",
            "remove_padding",
            "shared_contexts_ratio",
            "layernorm_eps",
            "layernorm_type",
            "activation_type",
            "has_adapters",
            "adapter_inter_size",
            "int8_mode",
        )
        entries = [f"{name}: {getattr(self, name)}" for name in shown_fields]
        args_str = ",\n ".join(entries)
        return f"{self.__class__.__name__}[\n{args_str}\n]"

    @classmethod
    def from_config(cls, config: GptInitModelParameters, **kwargs):
        """Build a context decoder from ``config``.

        ``remove_padding`` and ``shared_contexts_ratio`` are read from
        ``kwargs`` (defaults: True and 1.0); everything else comes from
        ``config``. The intermediate size is four times the hidden size.
        """
        hidden_size = config.head_num * config.size_per_head
        remove_padding = kwargs.get("remove_padding", True)
        shared_contexts_ratio = kwargs.get("shared_contexts_ratio", 1.0)
        return cls(
            num_heads=config.head_num,
            size_per_head=config.size_per_head,
            inter_size=4 * hidden_size,
            num_layers=config.layer_num,
            tensor_para_size=config.tensor_para_size,
            pipeline_para_size=config.pipeline_para_size,
            remove_padding=remove_padding,
            shared_contexts_ratio=shared_contexts_ratio,
            layernorm_eps=config.layernorm_eps,
            layernorm_type=config.layernorm_type,
            activation_type=config.activation_type,
            has_adapters=config.has_adapters,
            adapter_inter_size=config.adapter_inter_size,
            int8_mode=config.int8_mode,
        )

    def _initialize_model(self, force_init=False):
        """Create default weights if missing and (re)build the FT op.

        # Args.
            force_init: bool, rebuild the op even if one already exists.
        """
        if self.weight is None:
            self.weight = GptLayerWeights(
                num_heads=self.num_heads,
                size_per_head=self.size_per_head,
                inter_size=self.inter_size,
                num_layers=self.num_layers,
                tensor_para_size=self.tensor_para_size,
                pipeline_para_size=self.pipeline_para_size,
                has_adapters=self.has_adapters,
                adapter_inter_size=self.adapter_inter_size,
                int8_mode=self.int8_mode,
            )

        op_exists = self.ft_op is not None
        if op_exists and not force_init:
            # Nothing to do: keep the op that is already there.
            return
        if op_exists:
            del self.ft_op

        op_cls = torch.classes.FasterTransformer.ParallelGptContextDecoderOp
        self.ft_op = op_cls(
            self.num_heads,
            self.size_per_head,
            self.inter_size,
            self.num_layers,
            self.tensor_para_size,
            self.pipeline_para_size,
            self.layernorm_eps,
            self.layernorm_type,
            self.activation_type,
            self.has_adapters,
            self.adapter_inter_size,
            self.int8_mode,
            self.weight.weights,
            self.weight.int8_weights,
            self.weight.int8_scales,
            self.remove_padding,
        )

    def forward(
        self,
        input_embeds: torch.Tensor,
        attention_mask: torch.Tensor,
        input_lengths: torch.IntTensor,
        memory_length: Optional[int] = None,
        compact_index: Optional[torch.IntTensor] = None,
        batch_to_compact_index: Optional[torch.IntTensor] = None,
        linear_bias_slopes: Optional[torch.Tensor] = None,
    ):
        """Run the context decoder op on the input context.

        # Args.
            input_embeds: Tensor, (batch * beam, max_input_length,
                hidden_dim), input hidden states.
            attention_mask: Tensor, (batch * beam, max_input_length,
                max_input_length), input attention mask.
            input_lengths: (batch * beam,), input sequence lengths.
            memory_length: int, the length of memory to keep key/cache
                values.
            compact_index: IntTensor, (compact_batch_size,)
                The index of input sequences of a compact batch. If None,
                the FT op doesn't apply the shared context feature and as
                result the inference time may increase.
            batch_to_compact_index: IntTensor, (batch * beam,)
                The index map from the original input batch to the
                compact batch. This must be provided if compact_index is
                not None.
            linear_bias_slopes: (num_heads,)
                The slope per head of linear attention bias - ALiBi.
                If None, a base self attention will be performed.
        # Returns
            hidden_states: Tensor, (batch * beam, max_input_length,
                hidden_dim), decoder outputs.
            key_cache: Tensor, (num_layers, batch * beam, local_num_heads,
                size_per_head / x, memory_length, x), key cache of
                attention of inputs. x = 16 / sizeof(T),
                memory_length = max_input_length or
                max_input_length + gen_length
            value_cache: Tensor, (num_layers, batch * beam,
                local_num_heads, memory_length, hidden_dim)
                value cache of attention
            last_token_hidden_states: Tensor, (batch * beam, hidden_dim)
                hidden states of the last input token.
        """
        self._initialize_model()
        # The op yields exactly four outputs; unpack them by name.
        (
            hidden_states,
            key_cache,
            value_cache,
            last_token_hidden_states,
        ) = self.ft_op.forward(
            input_embeds,
            attention_mask,
            input_lengths,
            memory_length,
            compact_index,
            batch_to_compact_index,
            linear_bias_slopes,
        )
        return hidden_states, key_cache, value_cache, last_token_hidden_states
def _initialize_model(self, force_init=False):
    """Create default weights if missing and (re)build the FT decoder op.

    An existing op is kept unless ``force_init`` is true, in which case it
    is deleted and a new ``ParallelGptDecoderOp`` is constructed from the
    current weights.

    # Args.
        force_init: bool, rebuild the op even if one already exists.
    """
    if self.weight is None:
        self.weight = GptLayerWeights(
            num_heads=self.num_heads,
            size_per_head=self.size_per_head,
            inter_size=self.inter_size,
            num_layers=self.num_layers,
            tensor_para_size=self.tensor_para_size,
            pipeline_para_size=self.pipeline_para_size,
            has_adapters=self.has_adapters,
            adapter_inter_size=self.adapter_inter_size,
            int8_mode=self.int8_mode,
        )

    op_exists = self.ft_op is not None
    if op_exists and not force_init:
        # Nothing to do: keep the op that is already there.
        return
    if op_exists:
        del self.ft_op

    layer_weights = self.weight
    op_cls = torch.classes.FasterTransformer.ParallelGptDecoderOp
    self.ft_op = op_cls(
        self.num_heads,
        self.size_per_head,
        self.inter_size,
        self.num_layers,
        self.tensor_para_size,
        self.pipeline_para_size,
        self.layernorm_eps,
        self.layernorm_type,
        self.activation_type,
        self.has_adapters,
        self.adapter_inter_size,
        # The int8 mode is taken from the weights object here.
        layer_weights.int8_mode,
        layer_weights.weights,
        layer_weights.int8_weights,
        layer_weights.int8_scales,
    )
def __init__(
    self,
    num_heads: int,
    size_per_head: int,
    num_layers: int,
    vocab_size: int,
    start_id: int,
    end_id: int,
    lib_path: PathLike,
    tensor_para_size: int = 1,
    pipeline_para_size: int = 1,
    remove_padding: bool = True,
    shared_contexts_ratio: float = 1.0,
    layernorm_eps: float = 1e-6,
    layernorm_type: LayernormType = "pre_layernorm",
    activation_type: str = "gelu",
    has_positional_encoding: bool = True,
    max_seq_len: int = 0,
    has_pre_decoder_layernorm: bool = False,
    has_post_decoder_layernorm: bool = True,
    has_adapters: bool = False,
    adapter_inter_size: int = 0,
    int8_mode: int = 0,
    inference_data_type: Optional[str] = None,
    weights_data_type: str = "fp32",
    use_fp32_to_compute_logit: bool = False,
    **kwargs,
):
    """Build the config, load the FT library and create the sub-modules.

    Most arguments are stored unchanged in ``self.config`` (a
    ``GptInitModelParameters``). No checkpoint is read here;
    ``self.weight`` stays None until ``load`` is called.

    # Args.
        lib_path: path of the library passed to
            ``torch.classes.load_library`` (made absolute first).
        remove_padding, shared_contexts_ratio: forwarded to
            ``GptContextDecoder.from_config``.
        inference_data_type: data type used to create the torch modules;
            falls back to ``weights_data_type`` when not given.
        use_fp32_to_compute_logit: if True, ``lm_head`` is created in
            float32 instead of the inference data type.
        **kwargs: ``sparse`` is read for the config (default False); all
            kwargs are also forwarded to the decoder ``from_config``
            calls.
    """
    super().__init__()
    inference_data_type = inference_data_type or weights_data_type
    self.config = GptInitModelParameters(
        head_num=num_heads,
        size_per_head=size_per_head,
        layer_num=num_layers,
        max_seq_len=max_seq_len,
        tensor_para_size=tensor_para_size,
        vocab_size=vocab_size,
        start_id=start_id,
        end_id=end_id,
        pipeline_para_size=pipeline_para_size,
        data_type=inference_data_type,
        weights_data_type=weights_data_type,
        layernorm_eps=layernorm_eps,
        layernorm_type=layernorm_type,
        activation_type=activation_type,
        has_positional_encoding=has_positional_encoding,
        has_pre_decoder_layernorm=has_pre_decoder_layernorm,
        has_post_decoder_layernorm=has_post_decoder_layernorm,
        has_adapters=has_adapters,
        adapter_inter_size=adapter_inter_size,
        int8_mode=int8_mode,
        sparse=kwargs.get("sparse", False),
    )
    self.use_fp32_to_compute_logit = use_fp32_to_compute_logit
    self.weight = None
    self.shared_contexts_ratio = shared_contexts_ratio
    # The custom classes used below (torch.classes.FasterTransformer.*)
    # are looked up only after this library has been loaded.
    torch.classes.load_library(os.path.abspath(lib_path))
    # Embeddings to encode or decode tokens.
    hidden_dim = num_heads * size_per_head
    # Pad vocab size for FT.
    local_vocab_size = math.ceil(
        self.config.vocab_size / self.config.tensor_para_size
    )
    # For fp16 the per-rank vocab size is rounded up to a multiple of 8.
    if self.config.data_type == "fp16":
        local_vocab_size = math.ceil(local_vocab_size / 8) * 8
    self.vocab_size_padded = (
        local_vocab_size * self.config.tensor_para_size
    )
    self.vocab_size = self.config.vocab_size
    self.decode_op = torch.classes.FasterTransformer.DynamicDecodeOp(
        self.vocab_size,
        self.vocab_size_padded,
        self.config.tensor_para_size,
        self.config.pipeline_para_size,
        torch.float,
    )
    # Name -> sub-module registry; cuda()/to()/float()/half()/bfloat16()
    # iterate over it to convert every registered member.
    self._parameters = {}

    def register_param(name, p):
        # Record the member in the registry and expose it as an attribute.
        self._parameters[name] = p
        setattr(self, name, p)

    register_param(
        "context_decoder",
        GptContextDecoder.from_config(
            self.config,
            remove_padding=remove_padding,
            shared_contexts_ratio=shared_contexts_ratio,
            **kwargs,
        ),
    )
    register_param(
        "decoder", GptDecoder.from_config(self.config, **kwargs)
    )
    compute_dtype = to_torch_dtype(inference_data_type)
    # Input-side modules are created only on the first pipeline group.
    if comm.is_pipeline_group_first():
        register_param(
            "word_embedding",
            torch.nn.Embedding(
                self.vocab_size_padded, hidden_dim, dtype=compute_dtype
            ),
        )
        # Zero the rows that exist only because of vocab padding.
        self._mask_padded_vocab_weights(self.word_embedding.weight)
        if self.config.has_positional_encoding:
            register_param(
                "position_encoding",
                torch.nn.Embedding(
                    self.config.max_seq_len,
                    hidden_dim,
                    dtype=compute_dtype,
                ),
            )
        else:
            self.position_encoding = None
        if self.config.has_pre_decoder_layernorm:
            register_param(
                "pre_decoder_layernorm",
                torch.nn.LayerNorm(
                    hidden_dim, eps=layernorm_eps, dtype=compute_dtype
                ),
            )
        else:
            self.pre_decoder_layernorm = None
    # Output-side modules are created only on the last pipeline group.
    if comm.is_pipeline_group_last():
        if has_post_decoder_layernorm:
            register_param(
                "post_decoder_layernorm",
                torch.nn.LayerNorm(
                    hidden_dim, eps=layernorm_eps, dtype=compute_dtype
                ),
            )
        else:
            self.post_decoder_layernorm = None
        # dtype of the logit projection: float32 when requested,
        # otherwise the inference data type.
        self.lm_head_ctype = (
            compute_dtype
            if not self.use_fp32_to_compute_logit
            else torch.float32
        )
        register_param(
            "lm_head",
            torch.nn.Linear(
                hidden_dim,
                self.vocab_size_padded,
                bias=False,
                dtype=self.lm_head_ctype,
            ),
        )
        self._mask_padded_vocab_weights(self.lm_head.weight)
def load(
    self,
    checkpoint_path: PathLike,
    inference_data_type: Optional[Union[str, torch.dtype]] = None,
    config: Optional[GptInitModelParameters] = None,
    device: Optional[Union[str, int, torch.device]] = None,
):
    """Load all model weights from an FT checkpoint directory.

    The transformer-layer weights are loaded through ``GptLayerWeights``
    and handed to both decoders; embeddings, layernorms and the LM head
    are then read from their ``.bin`` files on the pipeline groups that
    own them. Finally every registered member is moved to ``device``.

    # Args.
        checkpoint_path: str or Path, directory holding the ``.bin``
            files.
        inference_data_type: str or torch.dtype used for computation.
            When omitted, the dtype of already loaded weights is used,
            or ``config.data_type`` if nothing has been loaded yet.
        config: model parameters; defaults to ``self.config``.
        device: target device; defaults to ``comm.get_device()`` only
            when None, so an explicit device index 0 is respected.
    # Raises.
        FileNotFoundError: if a position-encoding or layernorm file is
            missing. A missing embedding / LM-head file is not fatal:
            the parameter is initialized randomly and a message is
            printed.
    """
    checkpoint_path = Path(checkpoint_path)
    if device is None:
        device = comm.get_device()
    config = config or self.config

    if not inference_data_type:
        # self.dtype asserts that weights exist, so on the very first
        # load fall back to the data type stored in the config.
        has_weights = self.weight is not None and bool(self.weight.weights)
        inference_data_type = self.dtype if has_weights else config.data_type
    compute_dtype = to_torch_dtype(inference_data_type)

    self.weight = GptLayerWeights.from_config(config)
    self.weight.load(
        checkpoint_path, compute_dtype, config.weights_data_type, device
    )
    self.context_decoder.set_weight(self.weight)
    self.decoder.set_weight(self.weight)

    weight_dtype = to_numpy_dtype(config.weights_data_type)

    def _safe_load_from_bin(param: torch.nn.Parameter, fname):
        if not (checkpoint_path / fname).exists():
            raise FileNotFoundError(f"Failed to load {fname}")
        # np_w is 1-D array since a bin file doesn't have shape info.
        w_ = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
        param.data = (
            torch.from_numpy(w_)
            .reshape(param.data.shape)
            .to(compute_dtype)
        )

    def _safe_load_lm_head_from_bin(param, fname, ctype):
        if (checkpoint_path / fname).exists():
            shape = (
                self.vocab_size,
                self.config.head_num * self.config.size_per_head,
            )
            # np_w is 1-D array since a bin file doesn't have shape info.
            w_ = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
            param.data = param.data.to(ctype)
            # Only the real vocabulary rows are filled; padded rows are
            # zeroed by the masking call below.
            param.data[: self.vocab_size, :] = (
                torch.from_numpy(w_).reshape(shape).to(ctype)
            )
        else:
            print(f"Failed to load {fname}")
            # In-place random init; the parameter keeps its dtype.
            torch.nn.init.normal_(param)
        self._mask_padded_vocab_weights(param)

    if comm.is_pipeline_group_first():
        _safe_load_lm_head_from_bin(
            self.word_embedding.weight, "model.wte.bin", compute_dtype
        )
        self._mask_padded_vocab_weights(self.word_embedding.weight)
        if self.position_encoding is not None:
            _safe_load_from_bin(
                self.position_encoding.weight, "model.wpe.bin"
            )
        if self.pre_decoder_layernorm is not None:
            _safe_load_from_bin(
                self.pre_decoder_layernorm.weight,
                "model.pre_decoder_layernorm.weight.bin",
            )
            _safe_load_from_bin(
                self.pre_decoder_layernorm.bias,
                "model.pre_decoder_layernorm.bias.bin",
            )

    if comm.is_pipeline_group_last():
        if self.post_decoder_layernorm is not None:
            _safe_load_from_bin(
                self.post_decoder_layernorm.weight,
                "model.final_layernorm.weight.bin",
            )
            _safe_load_from_bin(
                self.post_decoder_layernorm.bias,
                "model.final_layernorm.bias.bin",
            )
        if (checkpoint_path / "model.lm_head.weight.bin").exists():
            _safe_load_lm_head_from_bin(
                self.lm_head.weight,
                "model.lm_head.weight.bin",
                self.lm_head_ctype,
            )
        elif self.use_fp32_to_compute_logit:
            _safe_load_lm_head_from_bin(
                self.lm_head.weight, "model.wte.bin", torch.float32
            )
        elif comm.get_pipeline_para_size() == 1:
            # In this branch we can share the pre and post
            # decoder embeddings, but ONLY pipeline size is 1.
            self.lm_head.weight = self.word_embedding.weight
        else:
            # When pipeline size > 1, these two weights will end up on
            # different GPUs, so we must load the
            # post decoder weight again.
            _safe_load_lm_head_from_bin(
                self.lm_head.weight, "model.wte.bin", compute_dtype
            )

    self.to(device)
def get_local_batch_size(self, batch_size):
    """Get a local batch size by the same way that FT Gpt does.

    Without pipeline parallelism the batch size is returned unchanged.
    Otherwise it is divided by the pipeline size when evenly divisible,
    then halved while it is even and larger than 1024.
    """
    pipeline_size = self.decoder.pipeline_para_size
    if pipeline_size <= 1:
        return batch_size

    per_stage = batch_size
    if per_stage % pipeline_size == 0:
        per_stage //= pipeline_size
    while per_stage > 1024 and per_stage % 2 == 0:
        per_stage //= 2
    return per_stage
input_lengths: IntTensor, (batch_size), the lengths of input context sequences. gen_length: int, the number of tokens to generate. local_batch_size: int, optional, a batch size of local iteration. (disabled) eos_token_id: int, eos token id. beam_width: int, number of beams for beam search. If 1, sampling decode will be used. top_k: IntTensor, (batch_size,) top-k sampling. The number of most probable tokens to keep for sampling per sentence in a batcch. top_p: FloatTensor, (batch_size,), top-p sampling. The cumulative probability of to filter the set of most probable tokens. top_p_decay: FloatTensor, (batch_size,) The decay of top-p value for top_p sampling. top_p_min: FloatTensor, (batch_size,) The minimum top p values in top-p decaying. top_p_reset_ids: IntTensor, (batch_size,) reset ids for resetting top_p values for top p sampling temperature: FloatTensor, (batch_size,), The temperature value for smoothing the logit distribution. repetition_penalty: FloatTensor, (batch_size,), The repetition penalty. presence_penalty: FloatTensor, (batch_size,), The presence penalty, which is exclusive with repetition_penalty. Only one of repetition and presence penalties is allowed. min_length: IntTensor, (batch_size,), Minimum length for each sentences. EOS is masked if length is below min. len_penalty: FloatTensor, (batch_size,) The exponent of the length penalty of beam scores. beam_search_diversity_rate: FloatTensor, (batch_size,), The diversity rate of beam search. stop_words_list: IntTensor, (batch_size, 2, stop_words_length) When FT generates words in this list, it will stop the generation. An extension of stop id. bad_words_list IntTensor, (batch_size, 2, bad_words_length) The words in the list will never be sampled. sequence_limit_lengths: IntTensor, (batch_size,), The maximum length of a generated sequence. memory_length: int, the length of cache memory. If None, it will be max_input_length + gen_length. 
# Returns IntTensor, (batch_size, beam_width, max_seq_length) output token ids. """ assert ( self.weight is not None ), "Please call load() first to initialize weights." input_token_ids = input_token_ids.type(torch.int32).to(self.device) input_lengths = input_lengths.type(torch.int32).to(self.device) batch_size = len(input_token_ids) max_input_length = input_token_ids.shape[-1] max_seq_length = max_input_length + gen_length memory_length = memory_length or max_seq_length # TODO: Enable local batch later. We currently disable local batching due to # noqa: E501 # an input mismatch issue of FT's decode_op: FT's decode_op requires logits # noqa: E501 # of shape (batch_size, ...) but we have logits of shape (local_batch_size, ...) # noqa: E501 # After fixing FT's side, we will enable local batch. # local_batch_size = local_batch_size or self.get_local_batch_size(batch_size) # noqa: E501 # num_local_batches, last_chunk = divmod(batch_size, local_batch_size) # if last_chunk > 0: # num_local_batches += 1 assert local_batch_size is None or local_batch_size == batch_size local_batch_size = batch_size num_local_batches = 1 device = self.device eos_token_id = ( eos_token_id if eos_token_id is not None else self.config.end_id ) assert ( eos_token_id is not None ), "eos_token-id must be specified in generation." eos_token_ids = eos_token_id * torch.ones( batch_size, dtype=torch.int32, device=device ) assert repetition_penalty is None or presence_penalty is None, ( "Found ambiguous parameters repetition_penalty and " "presence_penalty which are mutually exclusive. " "Please provide one of repetition_penalty and presence_penalty." ) # Setup decoder_op prior to calling the forward function. self.decode_op.setup( batch_size, beam_width, top_k, top_p, temperature, repetition_penalty, presence_penalty, min_length, len_penalty, beam_search_diversity_rate, random_seed, top_p_decay, top_p_min, top_p_reset_ids, ) # Prepare input and output arguments. 
if beam_width > 1: # Tiling for beam search. input_token_ids = input_token_ids.repeat(1, beam_width).view( batch_size * beam_width, -1 ) input_lengths = ( input_lengths.view(-1, 1).repeat(1, beam_width).view(-1) ) if sequence_limit_lengths is not None: sequence_limit_lengths = ( sequence_limit_lengths.view(-1, 1) .repeat(1, beam_width) .view(-1) ) # src/tgt cache indirections. cache_indirection = torch.zeros( (2, batch_size, beam_width, memory_length), dtype=torch.int32, device=device, ) parent_ids = torch.zeros( max_seq_length, batch_size * beam_width, dtype=torch.int32, device=device, ) else: cache_indirection = None src_cache_indirection = None tgt_cache_indirection = None parent_ids = None pad_lengths = max_input_length - input_lengths # Since tril() doesn't support bf16 dtype, # we create of bool type and then cast it to dtype. attention_mask = ( torch.ones( (max_input_length, max_input_length), dtype=torch.bool, device=device, ) .tril() .unsqueeze(0) .tile(input_token_ids.shape[0], 1, 1) .to(self.dtype) ) for b, input_length in enumerate(input_lengths): attention_mask[b, input_length:, ...] = 0 masked_tokens = self.generate_pad_mask(input_lengths, memory_length) finished = torch.zeros_like(input_lengths).bool() sequence_lengths = (max_input_length - 1) * torch.ones_like( input_lengths ) if return_log_probs or beam_width > 1: cum_log_probs = torch.zeros(batch_size * beam_width, device=device) output_log_probs = torch.zeros( (gen_length, batch_size * beam_width), device=device ) else: cum_log_probs = None output_log_probs = None # Contiguous buffer for each decode_op step, # it will be transposed tensor for the final output. output_token_ids = torch.zeros( (max_seq_length, batch_size * beam_width), dtype=torch.int32, device=device, ) output_token_ids[:max_input_length, ...] = input_token_ids.T if comm.is_pipeline_group_first(): # Prepare input tensors of decoder. 
input_embeds = self.word_embedding(input_token_ids) if self.position_encoding is not None: position_ids = torch.arange( 0, max_input_length, dtype=torch.int, device=device ) position_ids = position_ids.unsqueeze(0).view( -1, max_input_length ) input_embeds += self.position_encoding(position_ids) if self.pre_decoder_layernorm is not None: input_embeds = self.pre_decoder_layernorm(input_embeds) else: # Dummy input_embeds input_embeds = torch.empty( size=( batch_size * beam_width, max_input_length, self.context_decoder.hidden_size, ), dtype=self.context_decoder.dtype, device=device, ) use_shared_contexts = ( (self.shared_contexts_ratio > 0.0) and (max_input_length >= 1) and (batch_size > 1) ) batch_to_compact, compact_to_batch = None, None if use_shared_contexts: find_context_duplications = ( torch.ops.fastertransformer.find_context_duplications ) batch_to_compact, compact_to_batch = find_context_duplications( input_token_ids ) use_shared_contexts = ( compact_to_batch.shape[0] <= self.shared_contexts_ratio * batch_size ) if not use_shared_contexts: batch_to_compact, compact_to_batch = None, None profiler.start("ft-context-decoder") ( _, k_cache, v_cache, last_token_hidden_states, ) = self.context_decoder.forward( input_embeds=input_embeds, attention_mask=attention_mask, input_lengths=input_lengths, memory_length=memory_length, batch_to_compact_index=batch_to_compact, compact_index=compact_to_batch, ) profiler.stop("ft-context-decoder") for step in range(max_input_length, max_seq_length): src_indir_idx = (step - max_input_length) % 2 tgt_indir_idx = 1 - src_indir_idx is_generation_done = torch.tensor( [True], dtype=torch.bool, device=device ) for ite in range(num_local_batches): # The indices of the current local batch-beam. 
bbidx = range( ite * local_batch_size * beam_width, min( (ite + 1) * local_batch_size * beam_width, batch_size * beam_width, ), ) if cache_indirection is not None: bidx = range( ite * local_batch_size, min((ite + 1) * local_batch_size, batch_size), ) src_cache_indirection = cache_indirection[ src_indir_idx, bidx, ... ] tgt_cache_indirection = cache_indirection[ tgt_indir_idx, bidx, ... ] if step == max_input_length: hidden_states = last_token_hidden_states[bbidx, ...] else: if comm.is_pipeline_group_first(): input_embeds = self.word_embedding( output_token_ids[step - 1, bbidx] ) if self.position_encoding is not None: position_ids = (step - 1) * torch.ones_like( pad_lengths[bbidx] ) input_embeds += self.position_encoding( position_ids ) if self.pre_decoder_layernorm is not None: input_embeds = self.pre_decoder_layernorm( input_embeds ) else: # Dummy input_imbeds input_embeds = torch.empty( size=(len(bbidx), self.decoder.hidden_size), dtype=self.decoder.dtype, device=device, ) profiler.start("ft-decoder") hidden_states = self.decoder.forward( max_input_length=max_input_length, step=step, ite=ite, input_embeds=input_embeds, sequence_lengths=sequence_lengths[bbidx], key_cache=k_cache, value_cache=v_cache, finished=finished[bbidx], total_padding_tokens=pad_lengths[bbidx], cache_indirection=src_cache_indirection, masked_tokens=masked_tokens[bbidx, ...], ) profiler.stop("ft-decoder") if comm.is_pipeline_group_last(): if self.post_decoder_layernorm is not None: hidden_states = self.post_decoder_layernorm( hidden_states ) # We use logits of fp32 type to avoid overflow issue. if self.use_fp32_to_compute_logit: # The FT GPT op internally uses FP32 compute type # for matrix multiplication. # This will produce the same result with the # end-to-end FT's GPT op. 
logits = torch.nn.functional.linear( hidden_states.float(), self.lm_head.weight ) else: logits = self.lm_head(hidden_states).float() profiler.start("ft-decode") should_stop = self.decode_op.forward( logits.view(batch_size, beam_width, -1), step, max_input_length, ite, local_batch_size, eos_token_ids, top_k, top_p, temperature, repetition_penalty, presence_penalty, min_length, len_penalty, beam_search_diversity_rate, top_p_decay, top_p_min, top_p_reset_ids, None, input_lengths, sequence_limit_lengths, stop_words_list, bad_words_list, src_cache_indirection, output_token_ids.view(-1, batch_size, beam_width), finished, sequence_lengths, cum_log_probs, output_log_probs, parent_ids, tgt_cache_indirection, ) profiler.stop("ft-decode") is_generation_done &= should_stop # Broadcast from the last pipeline node if needed. profiler.start("ft-bcast") tensors_to_bcast = [ output_token_ids[step, ...], finished, sequence_lengths, is_generation_done, ] if beam_width > 1: tensors_to_bcast.append(tgt_cache_indirection) self.decode_op.broadcast_from_last_pipeline(tensors_to_bcast) profiler.stop("ft-bcast") if is_generation_done or finished.all(): break # Transpose (L, batch, beam) -> (batch, beam, L) output_token_ids = output_token_ids.view( -1, batch_size, beam_width ).permute(1, 2, 0) # Increase sequence_length by 1 because the sequence length of time step t is t - 1. 
# noqa: E501 sequence_lengths += 1 # Outputs output_dict = dict(output_token_ids=output_token_ids) if return_output_length: output_dict["output_lengths"] = sequence_lengths if return_log_probs: output_dict["cum_log_probs"] = cum_log_probs output_dict["output_log_probs"] = output_log_probs return output_dict ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/huggingface_gpt_convert.py ================================================ # Based on https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/huggingface_gpt_convert.py # noqa: E501 # Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Convert huggingface GPT model. Use https://huggingface.co/gpt2 as demo. 
""" import argparse import configparser import os import sys from loguru import logger import numpy as np from transformers import GPT2Model # transformers-4.10.0-py3 from nebullvm.optional_modules.torch import torch dir_path = os.path.dirname(os.path.realpath(__file__)) sys.path.append(dir_path + "/../../../..") sys.path.append(dir_path) def get_weight_data_type(data_type): if data_type == "fp32": return np.float32 elif data_type == "fp16": return np.float16 else: assert False, f"Invalid weight data type {data_type}" def split_and_convert_process(i, saved_dir, factor, key, args, val): if ( key.find("input_layernorm.weight") != -1 or key.find("input_layernorm.bias") != -1 or key.find("attention.dense.bias") != -1 or key.find("post_attention_layernorm.weight") != -1 or key.find("post_attention_layernorm.bias") != -1 or key.find("mlp.dense_4h_to_h.bias") != -1 or key.find("final_layernorm.weight") != -1 or key.find("final_layernorm.bias") != -1 ): # shared weights, only need to convert the weights of rank 0 if i == 0: saved_path = saved_dir + "/model." + key + ".bin" val.tofile(saved_path) elif ( key.find("attention.dense.weight") != -1 or key.find("mlp.dense_4h_to_h.weight") != -1 ): split_vals = np.split(val, factor, axis=0) for j in range(factor): saved_path = ( saved_dir + "/model." + key + ".%d.bin" % (i * factor + j) ) split_vals[j].tofile(saved_path) elif ( key.find("mlp.dense_h_to_4h.weight") != -1 or key.find("mlp.dense_h_to_4h.bias") != -1 ): split_vals = np.split(val, factor, axis=-1) for j in range(factor): saved_path = ( saved_dir + "/model." + key + ".%d.bin" % (i * factor + j) ) split_vals[j].tofile(saved_path) elif key.find("attention.query_key_value.bias") != -1: local_dim = (int)(val.shape[-1] / 3) val = val.reshape(3, local_dim) split_vals = np.split(val, factor, axis=-1) for j in range(factor): saved_path = ( saved_dir + "/model." 
+ key + ".%d.bin" % (i * factor + j) ) split_vals[j].tofile(saved_path) elif key.find("attention.query_key_value.weight") != -1: hidden_dim = val.shape[0] local_dim = (int)(val.shape[-1] / 3) val = val.reshape(hidden_dim, 3, local_dim) split_vals = np.split(val, factor, axis=-1) for j in range(factor): saved_path = ( saved_dir + "/model." + key + ".%d.bin" % (i * factor + j) ) split_vals[j].tofile(saved_path) else: logger.warning("[ERROR] cannot find key '{}'".format(key)) def split_and_convert(args): torch_device = "cuda" if torch.cuda.is_available() else "cpu" model = GPT2Model.from_pretrained(args.in_file).to(torch_device) main( args.saved_dir, model, args.trained_gpu_num, args.infer_gpu_num, args.processes, args.weight_data_type, ) def main( saved_dir, model: GPT2Model, trained_gpu_num=1, infer_gpu_num=1, processes=1, weight_data_type="fp32", ): assert isinstance(model, GPT2Model), "model must be GPT2Model" args = None saved_dir = saved_dir + "/%d-gpu/" % infer_gpu_num if not os.path.exists(saved_dir): os.makedirs(saved_dir) # ckpt_name = args.in_file t_gpu_num = trained_gpu_num i_gpu_num = infer_gpu_num assert i_gpu_num % t_gpu_num == 0 factor = (int)(i_gpu_num / t_gpu_num) # load position_embedding from rank 0 # torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' # model = GPT2Model.from_pretrained(args.in_file).to(torch_device) hf_config = vars(model.config) # NOTE: save parameters to config files (loaded by triton backends) config = configparser.ConfigParser() config["gpt"] = {} try: config["gpt"]["model_name"] = ( "gpt" if hf_config["_name_or_path"] == "" else hf_config["_name_or_path"] ) config["gpt"]["head_num"] = str(hf_config["n_head"]) n_embd = hf_config["n_embd"] config["gpt"]["size_per_head"] = str(n_embd // hf_config["n_head"]) config["gpt"]["inter_size"] = str(n_embd * 4) config["gpt"]["max_pos_seq_len"] = str(hf_config["n_positions"]) config["gpt"]["num_layer"] = str(hf_config["n_layer"]) config["gpt"]["vocab_size"] = 
str(hf_config["vocab_size"]) config["gpt"]["start_id"] = str(hf_config["bos_token_id"]) config["gpt"]["end_id"] = str(hf_config["eos_token_id"]) config["gpt"]["weight_data_type"] = weight_data_type with open(saved_dir + "/config.ini", "w") as configfile: config.write(configfile) except: # noqa: E722 logger.warning("Fail to save the config in config.ini.") np_weight_data_type = get_weight_data_type(weight_data_type) huggingface_model_name_pattern = [ "ln_1.bias", "ln_1.weight", "attn.c_attn.bias", "attn.c_attn.weight", "attn.c_proj.bias", "attn.c_proj.weight", "ln_2.bias", "ln_2.weight", "mlp.c_fc.bias", "mlp.c_fc.weight", "mlp.c_proj.bias", "mlp.c_proj.weight", ] ft_model_name_pattern = [ "input_layernorm.bias", "input_layernorm.weight", "attention.query_key_value.bias", "attention.query_key_value.weight", "attention.dense.bias", "attention.dense.weight", "post_attention_layernorm.bias", "post_attention_layernorm.weight", "mlp.dense_h_to_4h.bias", "mlp.dense_h_to_4h.weight", "mlp.dense_4h_to_h.bias", "mlp.dense_4h_to_h.weight", ] # torch.multiprocessing.set_start_method("spawn") # torch.multiprocessing.set_sharing_strategy("file_system") # pool = multiprocessing.Pool(args.processes) for name, param in model.named_parameters(): if name.find("weight") == -1 and name.find("bias") == -1: continue if name == "wpe.weight": param.detach().cpu().numpy().astype(np_weight_data_type).tofile( saved_dir + "model.wpe.bin" ) elif name == "wte.weight": param.detach().cpu().numpy().astype(np_weight_data_type).tofile( saved_dir + "model.wte.bin" ) elif name == "ln_f.bias": param.detach().cpu().numpy().astype(np_weight_data_type).tofile( saved_dir + "model.final_layernorm.bias.bin" ) elif name == "ln_f.weight": param.detach().cpu().numpy().astype(np_weight_data_type).tofile( saved_dir + "model.final_layernorm.weight.bin" ) elif name == "lm_head.weight": param.detach().cpu().numpy().astype(np_weight_data_type).tofile( saved_dir + "model.lm_head.weight.bin" ) else: for i in 
range(len(huggingface_model_name_pattern)): if name.find(huggingface_model_name_pattern[i]) != -1: new_name = name.replace("h.", "layers.").replace( huggingface_model_name_pattern[i], ft_model_name_pattern[i], ) # pool.starmap(split_and_convert_process, # [(0, saved_dir, factor, new_name, args, # param.detach().cpu().numpy().astype(np_weight_data_type))], # ) split_and_convert_process( 0, saved_dir, factor, new_name, args, param.detach() .cpu() .numpy() .astype(np_weight_data_type), ) # pool.close() # pool.join() if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument( "-saved_dir", "-o", type=str, help="file name of output file", required=True, ) parser.add_argument( "-in_file", "-i", type=str, help="file name of input checkpoint file", required=True, ) parser.add_argument( "-trained_gpu_num", "-t_g", type=int, help="How many gpus for inference", default=1, ) parser.add_argument( "-infer_gpu_num", "-i_g", type=int, help="How many gpus for inference", required=True, ) parser.add_argument( "-processes", "-p", type=int, help="How many processes to spawn for conversion (default: 4)", default=4, ) parser.add_argument( "-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"] ) args = parser.parse_args() logger.info("\n=============== Argument ===============") for key in vars(args): logger.info("{}: {}".format(key, vars(args)[key])) logger.info("========================================") split_and_convert(args) ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/intel_neural_compressor.py ================================================ from pathlib import Path from typing import Union from nebullvm.core.models import QuantizationType from nebullvm.operations.optimizations.compilers.base import Compiler from nebullvm.operations.optimizations.compilers.quantizations.intel_neural_compressor import ( # noqa: E501 
class IntelNeuralCompressorCompiler(Compiler):
    """Compiler operation that quantizes PyTorch models through Intel
    Neural Compressor."""

    # Quantization types accepted for each device type value.
    supported_ops = {
        "cpu": [
            QuantizationType.STATIC,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()
        self.model_orig = None

    def execute(
        self,
        model: Module,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using IntelNeuralCompressor library.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when ``quantization_type`` is not supported on the
        current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # input_data is optional for every type but STATIC, so it must not be
        # dereferenced unconditionally.
        train_input_data = (
            input_data.get_split("train") if input_data is not None else None
        )

        self.model_orig = model

        # Fall back to the unmodified model so the name is always bound.
        quantized_model = model
        if quantization_type is not None:
            quantized_model = self._quantize_model(
                model, quantization_type, input_tfms, train_input_data
            )

        self.compiled_model = self._compile_model(quantized_model)

    def _compile_model(self, model: Union[str, Path]):
        """Return ``model`` unchanged; no extra compilation step is run."""
        return model

    @staticmethod
    def _quantize_model(
        model: Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data: DataManager,
    ):
        """Delegate the quantization to ``quantize_neural_compressor``."""
        return quantize_neural_compressor(
            model, quantization_type, input_tfms, input_data
        )


class ONNXCompiler(Compiler):
    """Compiler operation that optionally quantizes an ONNX model file."""

    # Quantization types accepted for each device type value; ``None`` means
    # "no quantization".
    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [
            None,
            QuantizationType.HALF,
        ],
    }

    def execute(
        self,
        model: str,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using ONNX Runtime Compiler.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when ``quantization_type`` is not supported on the
        current device.

        Args:
            model (str): The onnx model path.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # input_data defaults to None and is only mandatory for STATIC, so
        # guard the lookup instead of failing with AttributeError.
        train_input_data = None
        if input_data is not None:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        if quantization_type is not None:
            model = self._quantize_model(
                model, train_input_data, quantization_type, input_tfms
            )

        self.compiled_model = self._compile_model(model)

    def _compile_model(self, model: Union[str, Path]):
        """Return ``model`` unchanged; no extra compilation step is run."""
        return model

    def _quantize_model(
        self,
        model_path: str,
        input_data: List[Tuple[np.ndarray, ...]],
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
    ):
        """Delegate the quantization to ``quantize_onnx``."""
        return quantize_onnx(
            model_path, input_data, quantization_type, self.device, input_tfms
        )
class OpenVINOCompiler(Compiler):
    """Compiler operation that converts an ONNX model with the OpenVINO
    ``mo`` command-line tool and optionally quantizes the result."""

    # Quantization types accepted for each device type value; ``None`` means
    # "no quantization".
    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
        ],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using OpenVINO library.

        On success ``self.compiled_model`` holds the path of the generated
        model without file extension; it is set to ``None`` when
        ``quantization_type`` is not supported on the current device.

        Args:
            model (str): The onnx model path.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # input_data defaults to None and is only mandatory for STATIC, so
        # guard the lookup instead of failing with AttributeError.
        train_input_data = None
        if input_data is not None:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        model_path = Path(model)
        input_names = get_input_names(str(model))

        cmd = [
            "mo",
            "--input_model",
            str(model),
            "--output_dir",
            str(model_path.parent),
            "--input",
            ",".join(input_names),
            "--input_shape",
            ",".join([f"{list(shape)}" for shape in model_params.input_sizes]),
        ]

        if quantization_type is QuantizationType.DYNAMIC:
            return None

        if quantization_type is QuantizationType.HALF:
            cmd = cmd + ["--compress_to_fp16"]

        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            # Surface the failure here; otherwise it only shows up later as a
            # missing .xml/.bin file.
            self.logger.warning(
                f"OpenVINO model optimizer exited with code "
                f"{completed.returncode}."
            )

        base_path = model_path.parent
        openvino_model_path = base_path / f"{model_path.stem}.xml"
        openvino_model_weights = base_path / f"{model_path.stem}.bin"

        if quantization_type not in [QuantizationType.HALF, None]:
            openvino_model_path, openvino_model_weights = self._quantize_model(
                model_topology=str(openvino_model_path),
                model_weights=str(openvino_model_weights),
                input_names=input_names,
                input_data=train_input_data,
            )

        # Stored without extension: parent directory joined with the stem.
        self.compiled_model = str(
            Path(openvino_model_path).parent / Path(openvino_model_path).stem
        )

    def _compile_model(
        self,
        model_name: str,
        model_weights: str,
        network_parameters: ModelParams,
    ) -> CompiledModel:
        """Read the model files and compile them for the CPU device.

        Args:
            model_name: Path of the model topology file.
            model_weights: Path of the model weights file.
            network_parameters: Model parameters forwarded to
                ``self._get_dynamic_shape``.

        Returns:
            The model compiled by ``Core.compile_model`` for ``"CPU"``.
        """
        core = Core()
        model = core.read_model(model=model_name, weights=model_weights)

        dynamic_shape = self._get_dynamic_shape(model, network_parameters)

        if dynamic_shape is not None:
            model.reshape(dynamic_shape)

        return core.compile_model(model=model, device_name="CPU")

    @staticmethod
    def _quantize_model(
        model_topology: str,
        model_weights: str,
        input_data: List[Tuple[np.ndarray, ...]],
        input_names: List[str],
    ) -> Tuple[str, str]:
        """Delegate the quantization to ``quantize_openvino``."""
        return quantize_openvino(
            model_topology, model_weights, input_data, input_names
        )
def _write_yaml_config(config: dict, tmp_dir: str, file_name: str):
    """Dump ``config`` as YAML into ``tmp_dir/file_name`` and return the path."""
    target = Path(tmp_dir) / file_name
    with open(target, "w") as stream:
        yaml.dump(config, stream)
    return target


def _prepare_quantization_config(model: Any, tmp_dir: str, approach: str):
    """Write the Neural Compressor quantization config for ``model``.

    Args:
        model: Model whose class name is recorded in the config.
        tmp_dir: Directory where ``temp_qt.yaml`` is created.
        approach: Value stored under ``quantization.approach``.

    Returns:
        Path of the written YAML file.
    """
    settings = {
        "model": {
            "name": model.__class__.__name__,
            "framework": "pytorch_fx",
        },
        "quantization": {"approach": approach},
        "evaluation": {"accuracy": {"metric": {"topk": 1}}},
        "tuning": {
            "accuracy_criterion": {"relative": 0.01},
        },
    }
    return _write_yaml_config(settings, tmp_dir, "temp_qt.yaml")


def _prepare_mixed_precision_config(model: Any, tmp_dir: str):
    """Write the Neural Compressor mixed-precision (bf16) config for ``model``.

    Args:
        model: Model whose class name is recorded in the config.
        tmp_dir: Directory where ``temp_mp.yaml`` is created.

    Returns:
        Path of the written YAML file.
    """
    settings = {
        "model": {
            "name": model.__class__.__name__,
            "framework": "pytorch_fx",
        },
        "mixed_precision": {"precisions": "bf16"},
        "evaluation": {"accuracy": {"metric": {"topk": 1}}},
        "tuning": {
            "accuracy_criterion": {"relative": 0.01},
        },
    }
    return _write_yaml_config(settings, tmp_dir, "temp_mp.yaml")


def _get_dataloader(input_data: DataManager):
    """Wrap ``input_data`` in a labelled ``PytorchDataset`` and a ``DataLoader``.

    The batch size is read from the first dimension of
    ``input_data[0][0][0]``.
    """
    batch_size = input_data[0][0][0].shape[0]
    dataset = PytorchDataset(input_data, has_labels=True)
    return DataLoader(dataset, batch_size)


def _quantize_static(model: Module, input_data: DataManager) -> GraphModule:
    """Run post-training static quantization.

    ``input_data`` feeds both the calibration and the evaluation loader.
    """
    with TemporaryDirectory() as workdir:
        config_path = _prepare_quantization_config(
            model, workdir, "post_training_static_quant"
        )
        quantizer = Quantization(str(config_path))
        quantizer.model = model
        quantizer.calib_dataloader = _get_dataloader(input_data)
        quantizer.eval_dataloader = _get_dataloader(input_data)
        return quantizer()


def _quantize_dynamic(model: Module) -> GraphModule:
    """Run post-training dynamic quantization (no data needed)."""
    with TemporaryDirectory() as workdir:
        config_path = _prepare_quantization_config(
            model, workdir, "post_training_dynamic_quant"
        )
        quantizer = Quantization(str(config_path))
        quantizer.model = model
        return quantizer()


def _mixed_precision(
    model: Module, input_tfms: MultiStageTransformation
) -> GraphModule:
    """Convert ``model`` to mixed precision.

    A ``HalfPrecisionTransformation`` is appended to ``input_tfms`` once the
    conversion has run.
    """
    with TemporaryDirectory() as workdir:
        config_path = _prepare_mixed_precision_config(model, workdir)
        converter = MixedPrecision(str(config_path))
        converter.model = model
        converted = converter()
        input_tfms.append(HalfPrecisionTransformation())
        return converted


def quantize_neural_compressor(
    model: Module,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data: DataManager,
) -> GraphModule:
    """Quantize ``model`` with Intel Neural Compressor.

    Args:
        model: The PyTorch model to quantize.
        quantization_type: STATIC, DYNAMIC or HALF.
        input_tfms: Input transformations; extended in the HALF case.
        input_data: Data used for the STATIC case.

    Returns:
        The quantized model.

    Raises:
        ValueError: If ``quantization_type`` is none of the above.
    """
    if quantization_type is QuantizationType.STATIC:
        return _quantize_static(model, input_data)
    if quantization_type is QuantizationType.DYNAMIC:
        return _quantize_dynamic(model)
    if quantization_type is QuantizationType.HALF:
        return _mixed_precision(model, input_tfms)
    raise ValueError(
        f"Quantization type {quantization_type} is not "
        f"supported by Intel Neural Compressor"
    )
class _IterableCalibrationDataReader(CalibrationDataReader):
    """Calibration reader that feeds ONNX Runtime one sample at a time."""

    def __init__(
        self,
        iterable_dataset: Union[Iterable[Tuple], List[Tuple]],
        input_names: List[str],
    ):
        """Pair every sample of ``iterable_dataset`` with ``input_names``.

        Args:
            iterable_dataset: Iterable of input tuples, one per sample.
            input_names: Names of the model inputs, in the same order as
                the values of each sample tuple.
        """
        # One feed dict per sample. The previous nested comprehension
        # collapsed the whole dataset into a single dict, so calibration
        # only ever saw one sample.
        self.iterable_dataset = iter(
            [dict(zip(input_names, inputs)) for inputs in iterable_dataset]
        )

    def get_next(self) -> dict:
        """Return the next feed dict, or ``None`` once the data is exhausted."""
        return next(self.iterable_dataset, None)

    @classmethod
    def from_dataloader(
        cls, dl: DataLoader, input_names: List[str], contains_y: bool = True
    ):
        """Build a reader from a dataloader.

        When ``contains_y`` is true the last element of every batch is
        dropped before it is paired with ``input_names``.
        """
        iterable_ds = iter(
            inputs[:-1] if contains_y else inputs for inputs in dl
        )
        return cls(iterable_ds, input_names)


def _quantize_dynamic(model_path: str) -> str:
    """Dynamically quantize the ONNX model and return the new file path.

    The output is written to ``<model dir>/../int8_dynamic/<stem>.quant.onnx``.
    """
    model_path = Path(model_path)
    output_dir = model_path.parent.parent / "int8_dynamic"
    output_dir.mkdir(parents=True, exist_ok=True)
    model_quant = output_dir / (model_path.stem + ".quant.onnx")
    quantize_dynamic(
        model_path,
        model_quant,
        weight_type=QuantType.QUInt8,
        optimize_model=False,
    )
    return str(model_quant)


def _get_quantization_type_for_static(use_gpu) -> Tuple[QuantType, QuantType]:
    """Returns the quantization types for activations and weights,
    depending on the underlying hardware
    """
    if use_gpu:
        # The CPU description is irrelevant here, so skip the cpuinfo probe.
        return QuantType.QInt8, QuantType.QInt8

    # Query cpuinfo once and reuse the result.
    cpu_info = cpuinfo.get_cpu_info()
    if "x86" in cpu_info["arch"].lower():
        cpu_raw_data = cpu_info["brand_raw"].lower()
        if "intel" in cpu_raw_data and "xeon" in cpu_raw_data:
            return QuantType.QUInt8, QuantType.QInt8
    return QuantType.QUInt8, QuantType.QUInt8


def _quantize_static(
    model_path: str, input_data: List[Tuple[np.ndarray, ...]], use_gpu: bool
) -> str:
    """Statically quantize the ONNX model, calibrating on ``input_data``.

    The output is written to ``<model dir>/../int8_static/<stem>.quant.onnx``
    and its path is returned.
    """
    model_path = Path(model_path)
    output_dir = model_path.parent.parent / "int8_static"
    output_dir.mkdir(parents=True, exist_ok=True)
    model_quant = output_dir / (model_path.stem + ".quant.onnx")

    input_names = get_input_names(str(model_path))
    cdr = _IterableCalibrationDataReader(
        input_names=input_names, iterable_dataset=input_data
    )
    activation_type, weight_type = _get_quantization_type_for_static(use_gpu)
    quantize_static(
        model_path,
        Path(model_quant),
        cdr,
        activation_type=activation_type,
        weight_type=weight_type,
        optimize_model=False,
    )
    return str(model_quant)


def _convert_to_half_precision(
    model_path: str, input_tfms: MultiStageTransformation
) -> str:
    """Convert the ONNX model to fp16 and return the new file path.

    The output is written to ``<model dir>/../fp16/<stem>_fp16.onnx``. A
    ``HalfPrecisionTransformation`` is appended to ``input_tfms`` once the
    model has been saved.
    """
    model_path = Path(model_path)
    output_dir = model_path.parent.parent / "fp16"
    # exist_ok matches the int8 helpers and keeps reruns from failing.
    output_dir.mkdir(parents=True, exist_ok=True)
    model_quant = output_dir / (model_path.stem + "_fp16.onnx")
    new_onnx_model = convert_float_to_float16_model_path(str(model_path))
    try:
        onnx.save(new_onnx_model, str(model_quant))
    except ValueError:
        # Model larger than 2GB must be saved as external data
        onnx.save(
            new_onnx_model,
            str(model_quant),
            save_as_external_data=True,
            all_tensors_to_one_file=False,
            convert_attribute=True,
        )
    # Registered only after a successful save so a failed conversion does not
    # leave a stray transformation in the caller's pipeline.
    input_tfms.append(HalfPrecisionTransformation())
    return str(model_quant)


def quantize_onnx(
    model_path: str,
    input_data: List[Tuple[np.ndarray, ...]],
    quantization_type: QuantizationType,
    device: Device,
    input_tfms: MultiStageTransformation,
) -> str:
    """Quantize the ONNX model at ``model_path``.

    Args:
        model_path: Path of the source ONNX model.
        input_data: Calibration samples; used for STATIC only.
        quantization_type: DYNAMIC, STATIC or HALF.
        device: Target device; decides the STATIC quantization types.
        input_tfms: Input transformations; extended in the HALF case.

    Returns:
        Path of the quantized model.

    Raises:
        ValueError: If ``quantization_type`` is none of the above.
    """
    if quantization_type == QuantizationType.DYNAMIC:
        return _quantize_dynamic(model_path)
    if quantization_type == QuantizationType.STATIC:
        return _quantize_static(
            model_path, input_data, device.type is DeviceType.GPU
        )
    if quantization_type == QuantizationType.HALF:
        return _convert_to_half_precision(model_path, input_tfms)
    raise ValueError(
        f"Quantization type {quantization_type} not supported"
    )
def quantize_openvino(
    model_topology: str,
    model_weights: str,
    input_data: List[Tuple[np.ndarray, ...]],
    input_names: List[str],
) -> Tuple[str, str]:
    """Quantize an OpenVINO model with the ``DefaultQuantization`` algorithm.

    Args:
        model_topology (str): Value passed as the ``model`` entry of the
            loader configuration.
        model_weights (str): Value passed as the ``weights`` entry of the
            loader configuration.
        input_data (List[Tuple[np.ndarray, ...]]): Calibration samples; their
            count is used as ``stat_subset_size``.
        input_names (List[str]): Input names, paired positionally with the
            values of each sample.

    Returns:
        Tuple[str, str]: The ``"model"`` and ``"weights"`` entries of the
            first item returned by ``save_model``.
    """
    loader_config = dict(
        model_name="model", model=model_topology, weights=model_weights
    )
    ie_config = dict(device="CPU")
    quantization_params = dict(
        target_device="ANY",
        preset="performance",
        stat_subset_size=len(input_data),
    )
    algorithms = [dict(name="DefaultQuantization", params=quantization_params)]
    calibration_loader = _CalibrationDataLoader(
        input_data=input_data, input_names=input_names
    )

    model = load_model(model_config=loader_config)
    engine = IEEngine(config=ie_config, data_loader=calibration_loader)
    compressed_model = create_pipeline(algorithms, engine).run(model=model)
    compress_model_weights(compressed_model)

    saved = save_model(
        model=compressed_model,
        save_path="quantized_model",
        model_name="quantized_model",
    )
    first_entry = saved[0]
    return first_entry["model"], first_entry["weights"]
convert_fx, ScriptModule, ) from nebullvm.tools.transformations import ( MultiStageTransformation, HalfPrecisionTransformation, ) from nebullvm.tools.utils import check_module_version class _QuantWrapper(Module): def __init__(self, model: Module): super(_QuantWrapper, self).__init__() qconfig = model.qconfig if hasattr(model, "qconfig") else None self.quant = QuantStub(qconfig) self.model = model self.dequant = DeQuantStub() def forward(self, *inputs: torch.Tensor): inputs = (self.quant(x) for x in inputs) outputs = self.model(*inputs) return tuple(self.dequant(x) for x in outputs) def _quantize_dynamic_torch(model: Module): layer_types = { type(layer) for layer in model.children() if len(list(layer.parameters())) > 0 } return torch.quantization.quantize_dynamic( model=model, qconfig_spec=layer_types, dtype=torch.qint8 ) def _quantize_dynamic_torch_fx( model: GraphModule, input_data: List[Tuple[torch.Tensor, ...]], ): qconfig_dict = {"": default_dynamic_qconfig} additional_arguments = {} if check_module_version(torch, min_version="1.13.0"): additional_arguments["example_inputs"] = input_data[0] model_prepared = prepare_fx(model, qconfig_dict, **additional_arguments) return convert_fx(model_prepared) def _quantize_static_torch( model: Module, input_data: List[Tuple[torch.Tensor, ...]], backend: str, ): model = _QuantWrapper(model) model.qconfig = torch.quantization.get_default_qconfig(backend) # TODO: change line below, it's wrong # model = torch.quantization.fuse_modules(model, [["conv", "relu"]]) model = torch.quantization.prepare(model) with torch.no_grad(): for tensors in input_data: _ = model(*tensors) return torch.quantization.convert(model) def _quantize_static_torch_fx( model: GraphModule, input_data: List[Tuple[torch.Tensor, ...]], backend: str, ): qconfig_dict = {"": torch.quantization.get_default_qconfig(backend)} additional_arguments = {} if check_module_version(torch, min_version="1.13.0"): additional_arguments["example_inputs"] = input_data[0] 
def _select_cpu_quantized_backend(device: Device) -> str:
    """Check the target device and activate a quantized engine.

    Args:
        device (Device): Device the quantized model will run on.

    Returns:
        str: ``"fbgemm"`` when torch lists it among the supported quantized
            engines, ``"qnnpack"`` otherwise. The chosen engine is also set
            as ``torch.backends.quantized.engine``.

    Raises:
        AssertionError: If the device is a GPU.
    """
    # The enum member lives in ``device.type``. Comparing the Device object
    # itself against DeviceType.GPU can never match, so that guard would
    # never fire. A bare DeviceType is accepted as well.
    device_type = getattr(device, "type", device)
    assert (
        device_type is not DeviceType.GPU
    ), "Quantization for torch is only available on CPU"
    backend = (
        "fbgemm"
        if "fbgemm" in torch.backends.quantized.supported_engines
        else "qnnpack"
    )
    torch.backends.quantized.engine = backend
    return backend


def _quantize_static(
    model: Union[Module, GraphModule],
    input_data: List[Tuple[torch.Tensor, ...]],
    device: Device,
):
    """Apply static quantization, using the fx path for ``GraphModule``.

    Args:
        model: Model to quantize.
        input_data: Samples used to calibrate the model.
        device: Target device; GPU is rejected.

    Returns:
        The quantized model.
    """
    backend = _select_cpu_quantized_backend(device)
    if isinstance(model, GraphModule):
        return _quantize_static_torch_fx(model, input_data, backend)
    return _quantize_static_torch(model, input_data, backend)


def _quantize_dynamic(
    model: Union[Module, GraphModule],
    input_data: List[Tuple[torch.Tensor, ...]],
    device: Device,
):
    """Apply dynamic quantization, using the fx path for ``GraphModule``.

    Args:
        model: Model to quantize.
        input_data: Samples; only forwarded to the fx implementation.
        device: Target device; GPU is rejected.

    Returns:
        The quantized model.
    """
    _select_cpu_quantized_backend(device)
    if isinstance(model, GraphModule):
        return _quantize_dynamic_torch_fx(model, input_data)
    return _quantize_dynamic_torch(model)
def quantize_tensorrt(
    quantization_type: QuantizationType,
    model_params: ModelParams,
    config,
    input_tfms: MultiStageTransformation,
    input_data: List[Tuple[np.ndarray, ...]] = None,
):
    """Set on a TensorRT builder config the flags for the given quantization.

    Args:
        quantization_type (QuantizationType): HALF enables the FP16 flag,
            STATIC enables INT8 with a calibrator. Any other value leaves
            the config untouched.
        model_params (ModelParams): Its ``batch_size`` is given to the
            calibrator.
        config: TensorRT builder config, modified in place.
        input_tfms (MultiStageTransformation): Not used by this function.
        input_data (List[Tuple[np.ndarray, ...]], optional): Calibration
            data, mandatory for static quantization. Default: None.

    Returns:
        The same ``config`` object received in input.
    """
    if quantization_type is QuantizationType.HALF:
        # Tensor RT does not need to transform input data
        # to fp16 because it expects always fp32
        config.set_flag(trt.BuilderFlag.FP16)
        return config

    if quantization_type is not QuantizationType.STATIC:
        return config

    assert input_data is not None, (
        "You need to specify the calibration data for "
        "performing static quantization."
    )
    int8_calibrator = TensorRTCalibrator(
        batch_size=model_params.batch_size,
        input_data=input_data,
    )
    config.set_flag(trt.BuilderFlag.INT8)
    config.int8_calibrator = int8_calibrator
    return config
def quantize_apache_tvm(
    model: Any,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data: DataManager,
    params: Any,
):
    """Quantize a TVM relay module.

    Args:
        model (Any): The relay module to quantize.
        quantization_type (QuantizationType): HALF converts the module with
            ``ToMixedPrecision``; DYNAMIC quantizes with a global scale;
            STATIC quantizes with KL-divergence calibration on the "train"
            split of ``input_data``.
        input_tfms (MultiStageTransformation): Receives a
            ``HalfPrecisionTransformation`` when HALF is requested.
        input_data (DataManager): Calibration data, read only for STATIC.
        params (Any): Parameters forwarded to ``relay.quantize.quantize``.

    Returns:
        The quantized module; ``model`` itself when ``quantization_type`` is
        None; None for an unsupported quantization type.
    """
    if quantization_type is None:
        # No quantization requested: return the module untouched rather than
        # reaching the end of the function without any result assigned.
        return model

    if quantization_type is QuantizationType.HALF:
        quantized_model = ToMixedPrecision(mixed_precision_type="float16")(
            model
        )
        input_tfms.append(HalfPrecisionTransformation())
        return quantized_model

    if quantization_type is QuantizationType.DYNAMIC:
        with relay.quantize.qconfig(
            calibrate_mode="global_scale", global_scale=8.0
        ):
            return relay.quantize.quantize(model, params)

    if quantization_type is QuantizationType.STATIC:
        samples = input_data.get_split("train").get_numpy_list(
            QUANTIZATION_DATA_NUM
        )
        input_names = [f"input_{n}" for n in range(len(samples[0]))]
        calibration_data = TVMCalibrator(samples, input_names)
        with relay.quantize.qconfig(
            calibrate_mode="kl_divergence", weight_scale="max"
        ):
            return relay.quantize.quantize(
                model, params, dataset=calibration_data
            )

    # Unsupported quantization type: nothing is produced.
    return None
class TensorRTCompiler(Compiler, abc.ABC):
    """Abstract base for the TensorRT compilers."""

    supported_ops = {
        "cpu": [],
        "gpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
        ],
    }

    def __init__(self):
        super().__init__()
        self.model_orig = None

    @staticmethod
    def _extract_dynamic_shape_ranges(model_params: ModelParams):
        """Build one shape description per model input.

        Args:
            model_params (ModelParams): The model parameters.

        Returns:
            list: One dict per input. Without dynamic info the dict is
                ``{"shape": size}``; otherwise it has the keys
                ``min_shape``, ``opt_shape`` and ``max_shape``, where the
                dynamic axes take the configured values and the other axes
                keep the static size.
        """
        bounds = (
            ("min_shape", "min_val"),
            ("opt_shape", "opt_val"),
            ("max_shape", "max_val"),
        )
        shapes = []
        for idx, info in enumerate(model_params.input_infos):
            fixed_shape = info.size
            if model_params.dynamic_info is None:
                shapes.append({"shape": fixed_shape})
                continue

            dynamic_axes = model_params.dynamic_info.inputs[idx]
            assert all(
                key in axis_range
                for axis_range in dynamic_axes.values()
                for key in ["min_val", "opt_val", "max_val"]
            ), (
                "Missing min/opt/max ranges, TensorRT needs them to "
                "enable dynamic shape properly"
            )
            ranges = {}
            for shape_key, value_key in bounds:
                ranges[shape_key] = [
                    dynamic_axes[axis][value_key]
                    if axis in dynamic_axes
                    else fixed_shape[axis]
                    for axis in range(len(fixed_shape))
                ]
            shapes.append(ranges)
        return shapes

    @abc.abstractmethod
    def execute(self, *args, **kwargs):
        pass
@torch.no_grad()
def _compile_model(
    self,
    model: Module,
    model_params: ModelParams,
    input_tensors: List[torch.Tensor],
    dtype: torch.dtype,
    calibrator: DataLoaderCalibrator,
    quantization_type: QuantizationType,
):
    """Convert the model to TorchScript and compile it with torch_tensorrt.

    Args:
        model (Module): The pytorch model; moved to the compiler device and
            set in eval mode.
        model_params (ModelParams): The model parameters, used to get the
            shape of each input.
        input_tensors (List[torch.Tensor]): Sample inputs, used for tracing
            and to choose the dtype of each compiled input.
        dtype (torch.dtype): Precision requested for the compiled model.
        calibrator (DataLoaderCalibrator): Used only for static
            quantization.
        quantization_type (QuantizationType): The requested quantization.

    Returns:
        The model returned by ``torch_tensorrt.compile``.
    """
    model.to(self.device.to_torch_format()).eval()
    use_half = quantization_type is QuantizationType.HALF

    try:
        if use_half:
            ts_model = torch.jit.script(copy.deepcopy(model).half()).half()
        else:
            ts_model = torch.jit.script(model)
    except Exception:
        # Scripting failed: trace the model with the sample tensors instead.
        if use_half:
            half_samples = [t.half() for t in input_tensors]
            ts_model = torch.jit.trace(
                copy.deepcopy(model).half(), half_samples
            ).half()
        else:
            ts_model = torch.jit.trace(model, input_tensors)

    with torch_tensorrt.logging.errors():
        shape_ranges = self._extract_dynamic_shape_ranges(model_params)

        input_specs = []
        for position, tensor in enumerate(input_tensors):
            # Integer inputs keep their dtype even in half precision.
            if dtype == torch.half and tensor.dtype not in [
                torch.int8,
                torch.int32,
            ]:
                spec_dtype = torch.half
            else:
                spec_dtype = tensor.dtype
            input_specs.append(
                torch_tensorrt.Input(
                    **shape_ranges[position], dtype=spec_dtype
                )
            )

        if quantization_type is QuantizationType.STATIC:
            active_calibrator = calibrator
        else:
            active_calibrator = None

        trt_model = torch_tensorrt.compile(
            ts_model,
            inputs=input_specs,
            enabled_precisions=TORCH_TENSORRT_PRECISIONS[str(dtype)],
            calibrator=active_calibrator,
            workspace_size=self.device.get_free_memory(),
            device={
                "device_type": torch_tensorrt.DeviceType.GPU,
                "gpu_id": self.device.idx,
                "dla_core": 0,
                "allow_gpu_fallback": False,
                "disable_tf32": False,
            },
            truncate_long_and_double=True,
        )

    # Delete calibration cache
    cache_file = "calibration.cache"
    if os.path.exists(cache_file):
        os.remove(cache_file)

    return trt_model
Default: None is_diffusion (bool): Whether the model is a diffusion model. Default: False. """ if quantization_type not in self.supported_ops[self.device.type.value]: self.compiled_model = None return if quantization_type is QuantizationType.STATIC and input_data is None: raise ValueError("Input data is required for static quantization.") self.logger.info( f"Optimizing with {self.__class__.__name__} and " f"q_type: {quantization_type}." ) check_quantization(quantization_type, metric_drop_ths) train_input_data = input_data.get_split("train").get_numpy_list( QUANTIZATION_DATA_NUM ) if self.simplify_model and not is_diffusion: try: import onnxsim # noqa: F401 # Simplify model, otherwise tensor RT won't work # on gpt2 and some other models. simplified_model = str(model) + "_simplified" if not Path(simplified_model).is_file(): cmd = [ "onnxsim", str(model), simplified_model, ] subprocess.run(cmd, stdout=subprocess.DEVNULL) # First try with simplified model self.onnx_model_path = simplified_model assert os.path.isfile(self.onnx_model_path) except Exception: # Use original model self.logger.warning( "Unable to simplify model with ONNX Simplifier. " "Original ONNX model will be used to build " "TensorRT engine" ) self.onnx_model_path = str(model) self.simplify_model = False elif self.onnx_model_path is None: self.onnx_model_path = str(model) if is_diffusion: if quantization_type is None: self.logger.warning( "Skipping float32 precision for Stable Diffusion, " "half precision will be used instead." ) return if quantization_type is QuantizationType.STATIC: self.logger.warning( "Skipping static quantization for Stable Diffusion " "because for now it's not supported." 
) return if self.simplify_model and is_diffusion: optimized_model = str(Path(model).parent / "model_opt.onnx") unet = UNet(hf_token=None) opt_graph = unet.optimize(onnx.load(str(model))) try: onnx.save(opt_graph, optimized_model) except Exception: onnx.save( opt_graph, optimized_model, save_as_external_data=True ) self.onnx_model_path = optimized_model self.simplify_model = False elif self.onnx_model_path is None: self.onnx_model_path = str(model) # -- Build phase -- nvidia_logger = trt.Logger(trt.Logger.ERROR) builder = trt.Builder(nvidia_logger) # create network definition network = builder.create_network( 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) ) # build the engine # TODO: setup config value for the class in a config file config = builder.create_builder_config() try: config.set_memory_pool_limit( trt.MemoryPoolType.WORKSPACE, self.device.get_free_memory() ) except AttributeError: # The method set_memory_pool_limit is not available # until TensorRT Release 8.4.1 self.logger.warning( "Cannot call method set_memory_pool_limit for TensorRT. " "because your version is lower than 8.4.1. " "Please update TensorRT version." 
def _compile_model(
    self,
    onnx_model_path: str,
    model_params: ModelParams,
    config,
    network,
    builder,
    nvidia_logger,
):
    """Parse the ONNX file and build the serialized TensorRT network.

    Args:
        onnx_model_path (str): Path to the ONNX model to parse.
        model_params (ModelParams): The model parameters; when they carry
            dynamic info an optimization profile is added to ``config``.
        config: TensorRT builder config.
        network: TensorRT network definition filled by the parser.
        builder: TensorRT builder.
        nvidia_logger: Logger handed to the ONNX parser.

    Returns:
        The result of ``builder.build_serialized_network``.

    Raises:
        ValueError: If the parser fails on the ONNX file.
    """
    parser = trt.OnnxParser(network, nvidia_logger)
    if not parser.parse_from_file(onnx_model_path):
        for error_index in range(parser.num_errors):
            self.logger.debug(parser.get_error(error_index))
        raise ValueError(
            f"Errors occurred while processing the "
            f"ONNX file at {onnx_model_path}"
        )

    if model_params.dynamic_info is None:
        return builder.build_serialized_network(network, config)

    shape_ranges = self._extract_dynamic_shape_ranges(model_params)
    profile = builder.create_optimization_profile()
    for position, input_name in enumerate(get_input_names(onnx_model_path)):
        ranges = shape_ranges[position]
        profile.set_shape(
            input_name,
            ranges["min_shape"],
            ranges["opt_shape"],
            ranges["max_shape"],
        )
    config.add_optimization_profile(profile)
    return builder.build_serialized_network(network, config)
nebullvm.operations.optimizations.compilers.quantizations.tensorflow import ( # noqa: E501 quantize_tensorflow, ) from nebullvm.operations.optimizations.compilers.quantizations.utils import ( check_quantization, ) from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.tools.data import DataManager from nebullvm.tools.transformations import MultiStageTransformation class TensorflowBackendCompiler(Compiler): supported_ops = { "cpu": [None], "gpu": [None], } def execute( self, model: tf.Module, input_tfms: MultiStageTransformation = None, metric_drop_ths: float = None, quantization_type: QuantizationType = None, input_data: DataManager = None, **kwargs, ): """Optimize the input model using tensorflow built-in techniques. Args: model (tf.Module): The tensorflow model. input_tfms (MultiStageTransformation, optional): Transformations to be performed to the model's input tensors in order to get the prediction. Default: None. metric_drop_ths (float, optional): Threshold for the accepted drop in terms of precision. Any optimized model with a higher drop will be ignored. Default: None. quantization_type (QuantizationType, optional): The desired quantization algorithm to be used. Default: None. input_data (DataManager): User defined data. Default: None. """ if quantization_type not in self.supported_ops[self.device.type.value]: self.compiled_model = None return if quantization_type is QuantizationType.STATIC and input_data is None: raise ValueError("Input data is required for static quantization.") self.logger.info( f"Optimizing with {self.__class__.__name__} and " f"q_type: {quantization_type}." 
class TFLiteBackendCompiler(Compiler):
    """Compiler converting Keras models with the TFLite converter."""

    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [],
    }

    def execute(
        self,
        model: tf.Module,
        input_tfms: MultiStageTransformation,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Convert the input model to TFLite, optionally quantizing it.

        The result is stored in ``self.compiled_model``; it is set to None
        when the quantization type is not supported on the current device.

        Args:
            model (tf.Module): The tensorflow model.
            input_tfms (MultiStageTransformation): Transformations to be
                performed to the model's input tensors in order to get the
                prediction. Not used by this compiler.
            metric_drop_ths (float, optional): Threshold for the accepted
                drop in terms of precision. Any optimized model with a higher
                drop will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without data.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        if quantization_type is None:
            self.compiled_model = self._compile_model(model)
            return

        # Read the data only when a quantization is requested and data
        # exists: a plain conversion, or a quantization that ignores the
        # samples, must not fail just because ``input_data`` is None.
        train_input_data = (
            input_data.get_split("train").get_list(QUANTIZATION_DATA_NUM)
            if input_data is not None
            else None
        )
        self.compiled_model = self._quantize_model(
            model, quantization_type, train_input_data
        )

    def _compile_model(
        self,
        model: tf.Module,
    ):
        """Convert a Keras model to TFLite without any quantization."""
        converter = tf.lite.TFLiteConverter.from_keras_model(model)
        tflite_model = converter.convert()
        return tflite_model

    @staticmethod
    def _quantize_model(
        model: tf.Module,
        quantization_type: QuantizationType,
        input_data_tensorflow: List[Tuple[tf.Tensor, ...]],
    ):
        """Delegate the quantized conversion to ``quantize_tensorflow``."""
        return quantize_tensorflow(
            model, quantization_type, input_data_tensorflow
        )
class TorchNeuronCompiler(Compiler):
    """Compiler tracing PyTorch models with ``torch_neuron.trace``."""

    supported_ops = {
        "cpu": [],
        "gpu": [],
        "neuron": [None, QuantizationType.HALF],
    }

    @staticmethod
    def _check_dynamic_shape(network_parameters: ModelParams) -> bool:
        """Handles case when model inputs have dynamic shapes.
        For now TorchNeuron only supports dynamic shape for the
        batch dimension.

        Args:
            network_parameters (ModelParams): The model parameters.

        Returns:
            bool: True if the model has dynamic batch size, False otherwise.

        Raises:
            ValueError: If an input declares more than one dynamic axis, or
                a single dynamic axis that is not axis 0.
        """
        if network_parameters.dynamic_info is None:
            return False

        for i, input_shape in enumerate(
            network_parameters.dynamic_info.inputs
        ):
            if len(input_shape) > 1 or (
                len(input_shape) == 1 and input_shape.get(0) is None
            ):
                raise ValueError(
                    f"TorchNeuronCompiler only supports dynamic shapes for "
                    f"batch dimension. Provided dynamic info for input {i} "
                    f"is: {input_shape}. Please use padding for the other "
                    f"dimensions."
                )
        return True

    def execute(
        self,
        model: torch.nn.Module,
        model_params: ModelParams,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model with torch_neuron.

        The result is stored in ``self.compiled_model``; it is set to None
        when the quantization type is not supported on the current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            metric_drop_ths (float, optional): Threshold for the accepted
                drop in terms of precision. Any optimized model with a higher
                drop will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None.

        Raises:
            ValueError: If static quantization is requested without data, or
                if the dynamic info involves axes other than the batch one.
            RuntimeError: If the model cannot be traced.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        dynamic_batch_size = self._check_dynamic_shape(model_params)

        self.compiled_model = self._compile_model(
            model,
            input_data,
            quantization_type,
            dynamic_batch_size=dynamic_batch_size,
        )

    @torch.no_grad()
    def _compile_model(
        self,
        model: torch.nn.Module,
        input_data: DataManager,
        quantization_type: QuantizationType,
        dynamic_batch_size: bool,
    ) -> torch.jit.ScriptModule:
        """Trace the model with torch_neuron using one sample of the data.

        The fx-traced version of the model is tried first; on any failure
        the original module is traced directly.

        Args:
            model (torch.nn.Module): The pytorch model.
            input_data (DataManager): Data providing the tracing sample.
            quantization_type (QuantizationType): The requested quantization.
            dynamic_batch_size (bool): Forwarded to ``torch_neuron.trace``.

        Returns:
            The model returned by ``torch_neuron.trace``.

        Raises:
            RuntimeError: If both tracing attempts fail; the last error is
                chained as its cause.
        """
        input_sample = input_data.get_list(1)[0]

        # NOTE(review): "gpu" has no supported ops in this compiler, so this
        # branch looks unreachable through execute() - confirm whether the
        # neuron device type was intended here.
        if self.device.type is DeviceType.GPU:
            if quantization_type is QuantizationType.HALF:
                input_sample = [
                    t.to(self.device.to_torch_format()).half()
                    if torch.is_floating_point(t)
                    else t.to(self.device.to_torch_format())
                    for t in input_sample
                ]
            else:
                input_sample = [
                    t.to(self.device.to_torch_format()) for t in input_sample
                ]
            model.to(self.device.to_torch_format())

        model.eval()

        def trace(target):
            # Same arguments for both attempts; the list is rebuilt per call.
            return torch_neuron.trace(
                target,
                input_sample,
                dynamic_batch_size=dynamic_batch_size,
                compiler_args=["--fast-math", "none"]
                if quantization_type is None
                else None,
            )

        try:
            model_scripted = trace(symbolic_trace(model))
        except Exception:
            try:
                model_scripted = trace(model)
            except Exception as exc:
                # Keep the original failure attached for debugging.
                raise RuntimeError(
                    "Unable to trace model with torch_neuron."
                ) from exc

        return model_scripted

    @torch.no_grad()
    def _quantize_model(
        self,
        model: torch.nn.Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data_torch: List[Tuple[torch.Tensor, ...]],
    ):
        raise NotImplementedError()
class TorchXLACompiler(TorchScriptCompiler):
    """TorchScript-compiler variant whose compile step only moves the model.

    Precisions are listed for the ``"tpu"`` device type only (no
    quantization and ``QuantizationType.HALF``).
    """

    supported_ops = {
        "cpu": [],
        "gpu": [],
        "tpu": [None, QuantizationType.HALF],
    }

    @torch.no_grad()
    def _compile_model(
        self,
        model: torch.nn.Module,
        input_data: DataManager,
        quantization_type: QuantizationType,
    ) -> torch.nn.Module:
        """Return ``model`` after moving it to the compiler's torch device.

        ``input_data`` and ``quantization_type`` are not used here; no
        scripting or tracing is performed.
        """
        target_device = self.device.to_torch_format()
        return model.to(target_device)
@torch.no_grad()
def _compile_model(
    self,
    model: Union[Module, GraphModule],
    input_data: DataManager,
    quantization_type: QuantizationType,
) -> ScriptModule:
    """Convert ``model`` to TorchScript.

    Strategy for modules that are not already ``torch.fx.GraphModule``:
    script the torch.fx symbolic trace; if that fails, script the module
    itself; if that fails too, trace it with one sample of ``input_data``.
    A ``GraphModule`` is scripted directly.

    Args:
        model: Module (or fx GraphModule) to convert.
        input_data: Data manager; its first sample is the example input
            used by the ``torch.jit.trace`` fallback.
        quantization_type: Requested quantization. ``HALF`` makes the
            floating-point sample tensors half precision on GPU, and a
            ``None`` value enables the warning on torch.fx failure.

    Returns:
        The scripted (or traced) model.
    """
    input_sample = input_data.get_list(1)[0]
    if self.device.type is DeviceType.GPU:
        if quantization_type is QuantizationType.HALF:
            # Only floating-point tensors are cast to half; every tensor
            # is moved to the target device.
            input_sample = [
                t.to(self.device.to_torch_format()).half()
                if torch.is_floating_point(t)
                else t.to(self.device.to_torch_format())
                for t in input_sample
            ]
        else:
            input_sample = [
                t.to(self.device.to_torch_format()) for t in input_sample
            ]
        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source -- confirm the model move belongs to the GPU branch.
        model.to(self.device.to_torch_format())

    if not isinstance(model, torch.fx.GraphModule):
        model.eval()
        try:
            model_scripted = symbolic_trace(model)
            model_scripted = torch.jit.script(model_scripted)
        except Exception:
            # The warning is emitted only for non-quantized models; the
            # fallbacks below run in every case.
            if quantization_type is None:
                self.logger.warning("Unable to trace model with torch.fx")
            try:
                model_scripted = torch.jit.script(model)
            except Exception:
                # Last resort: tracing with the example input.
                model_scripted = torch.jit.trace(model, input_sample)
    else:
        model_scripted = torch.jit.script(model)

    return model_scripted
def execute(
    self,
    model: Union[Module, str],
    input_tfms: MultiStageTransformation,
    model_params: ModelParams,
    metric_drop_ths: float = None,
    quantization_type: QuantizationType = None,
    input_data: DataManager = None,
    **kwargs,
):
    """Build, optionally quantize, and compile a model with Apache TVM.

    The compiled artifact is stored in ``self.compiled_model``; it is set
    to ``None`` when ``quantization_type`` is not listed in
    ``supported_ops`` for the current device type.

    Args:
        model (Union[Module, str]): The input model. Can be a torch model
            or a path to an onnx model.
        input_tfms (MultiStageTransformation): Transformations forwarded
            to the quantization step.
        model_params (ModelParams): Model parameters.
        metric_drop_ths (float, optional): Threshold for the accepted drop
            in terms of precision. Default: None.
        quantization_type (QuantizationType, optional): The desired
            quantization algorithm to be used. Default: None.
        input_data (DataManager): User defined data. Default: None.

    Raises:
        ValueError: If static quantization is requested without input
            data.
    """
    allowed_q_types = self.supported_ops[self.device.type.value]
    if quantization_type not in allowed_q_types:
        self.compiled_model = None
        return

    static_without_data = (
        quantization_type is QuantizationType.STATIC and input_data is None
    )
    if static_without_data:
        raise ValueError("Input data is required for static quantization.")

    self.logger.info(
        f"Optimizing with {self.__class__.__name__} and "
        f"q_type: {quantization_type}."
    )

    check_quantization(quantization_type, metric_drop_ths)

    tvm_module, tvm_params = self._build_tvm_model(model, model_params)

    if quantization_type is not None:
        tvm_module = self._quantize_model(
            tvm_module,
            quantization_type,
            input_tfms,
            input_data,
            tvm_params,
        )

    self.compiled_model = self._compile_model(tvm_module, tvm_params)
@staticmethod
def _build_tvm_model_from_onnx(
    onnx_model_path: str, model_params: ModelParams
) -> Tuple[IRModule, Dict[str, NDArray]]:
    """Load an ONNX file and convert it with the relay ONNX frontend.

    The shape dictionary handed to relay pairs the names returned by
    ``get_input_names`` with ``model_params.input_sizes``, in order.

    Args:
        onnx_model_path: Path of the ONNX model file.
        model_params: Model parameters providing the input sizes.

    Returns:
        The ``(mod, params)`` pair produced by
        ``relay.frontend.from_onnx``.
    """
    input_names = get_input_names(onnx_model_path)
    shape_dict = dict(zip(input_names, model_params.input_sizes))
    loaded_model = onnx.load(onnx_model_path)
    relay_module, relay_params = relay.frontend.from_onnx(
        loaded_model, shape_dict
    )
    return relay_module, relay_params
def _compile_model(self, model: Any, params: Any) -> ExecutorFactoryModule:
    """Tune ``model`` with AutoTVM and build it for the current device.

    Args:
        model: Relay module to compile (indexed with ``"main"`` by the
            tuning step).
        params: Parameters passed to both the tuning and the build step.

    Returns:
        The library returned by ``relay.build``.
    """
    target = self._get_target(self.device)
    tuning_records = self._tune_tvm_model(target, model, params)
    try:
        # The records path is only ever handed to the per-task logging
        # callback, so the file may be missing (presumably when no
        # tunable task was extracted -- TODO confirm). Passing None makes
        # TVM build without history instead of failing on a missing file.
        records = tuning_records if os.path.exists(tuning_records) else None
        with autotvm.apply_history_best(records):
            with tvm.transform.PassContext(opt_level=3, config={}):
                lib = relay.build(model, target=target, params=params)
    finally:
        # Remove temporary file created by tvm, even when the build
        # raises, so failed compilations do not leave files behind.
        try:
            os.remove(tuning_records)
        except FileNotFoundError:
            pass
    return lib
def select_compilers_from_hardware_torch(device: Device):
    """List the compilers usable for PyTorch models on ``device``.

    Nothing is returned unless torch is available. TorchScript is then
    always included; the other compilers are added only when their
    backing package can be imported, and some only for a given device
    type (DeepSparse and Intel Neural Compressor on CPU, TensorRT on GPU).

    Args:
        device (Device): Target device; only ``device.type`` is read.

    Returns:
        list: ``ModelCompiler`` members, in the order they were checked.
    """
    from nebullvm.optional_modules.utils import torch_is_available

    compilers = []
    # NOTE(review): nesting reconstructed from whitespace-collapsed source
    # -- every check below is assumed to sit under the torch check.
    if torch_is_available():
        compilers.append(ModelCompiler.TORCHSCRIPT)
        if tvm_is_available():
            compilers.append(ModelCompiler.APACHE_TVM)
        if bladedisc_is_available():
            compilers.append(ModelCompiler.BLADEDISC)
        if torch_neuron_is_available():
            compilers.append(ModelCompiler.TORCH_NEURON)

        if device.type is DeviceType.CPU:
            if deepsparse_is_available():
                compilers.append(ModelCompiler.DEEPSPARSE)
            if intel_neural_compressor_is_available():
                compilers.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)
        elif device.type is DeviceType.GPU:
            # The helper must be *called*: the bare function object is
            # always truthy, which used to add TENSOR_RT on every GPU
            # even when torch_tensorrt was not installed.
            if torch_tensorrt_is_available():
                compilers.append(ModelCompiler.TENSOR_RT)
    return compilers
@staticmethod
def _get_default_config() -> Dict:
    """Return the default pruning configuration as a nested dictionary.

    The result has two top-level sections, ``"train"`` and ``"approach"``.
    A new dictionary is built on every call.
    """
    # see https://github.com/intel/neural-compressor/blob/master/neural_compressor/conf/config.py  # noqa
    # for further details
    optimizer_cfg = {"SGD": {"learning_rate": 0.001}}
    criterion_cfg = {
        "CrossEntropyLoss": {
            "reduction": "mean",
            "from_logits": False,
        },
    }
    train_cfg = {
        "optimizer": optimizer_cfg,
        "criterion": criterion_cfg,
        "epoch": 10,
        "start_epoch": 0,
        "end_epoch": 10,
        "iteration": 30,
        "execution_mode": "eager",  # either eager or graph
        # "hostfile": None,  # str for multinode training support
    }

    magnitude_pruner = {
        "start_epoch": 0,
        "end_epoch": 8,
        "prune_type": "basic_magnitude",
    }
    weight_compression_cfg = {
        "initial_sparsity": 0.0,
        "target_sparsity": 0.60,
        "start_epoch": 0,
        "end_epoch": 8,
        "pruners": [magnitude_pruner],
    }

    return {
        "train": train_cfg,
        "approach": {"weight_compression": weight_compression_cfg},
    }
class TorchIntelPruningCompressor(IntelPruningCompressor):
    """Intel pruning compressor specialised for PyTorch modules."""

    @staticmethod
    def _get_dataloader(input_data: DataManager):
        """Wrap ``input_data`` in a ``DataLoader`` over an ``INCDataset``.

        The loader batch size is the leading dimension of the first input
        tensor of the first batch.
        """
        first_batch_inputs = input_data[0][0]
        batch_size = first_batch_inputs[0].shape[0]
        return DataLoader(INCDataset(input_data), batch_size)

    def _compute_error(
        self,
        model: Module,
        compressed_model: Module,
        eval_input_data: DataManager,
        metric: Callable,
    ):
        """Average ``metric`` over the evaluation batches.

        For each ``(inputs, y)`` batch, ``metric`` receives the original
        model's prediction, the compressed model's prediction and ``y``.

        Returns:
            ``np.inf`` when there is no evaluation data, otherwise the sum
            of the metric values divided by the number of batches.
        """
        if not len(eval_input_data):
            return np.inf

        total = 0
        for inputs, y in eval_input_data:
            reference_out = model(*inputs)
            compressed_out = compressed_model(*inputs)
            total += metric(reference_out, compressed_out, y)
        return total / len(eval_input_data)
class RecipeBuilder:
    """Build a SparseML pruning recipe for an exported ONNX model."""

    def __init__(self, model_path):
        # Path of the ONNX file; it is read by both analysis steps.
        self.model_path = model_path

    def _make_analysis(self):
        """Analyze the ONNX model and store the result in ``self.analysis``."""
        analyzer = ModelAnalyzer(self.model_path)
        self.analysis = ProjectModelAnalysisSchema().load(analyzer.dict())

    def _compute_loss_sensitivity(self):
        """Run the magnitude loss-sensitivity analysis.

        Requires ``self.analysis`` (set by ``_make_analysis``). The
        evaluator output is stored in ``self.final_analysis``.
        """
        loss_analysis = pruning_loss_sens_magnitude(self.model_path)
        results_model = loss_analysis.results_model

        # Measurement keys are stringified before being handed over.
        model_measurements = {
            "baseline_measurement_key": (
                str(results_model.baseline_measurement_key)
            ),
            "measurements": {
                str(key): val for key, val in results_model.averages.items()
            },
        }
        ops = [
            {
                "id": res.id_,
                "name": res.name,
                "index": res.index,
                "baseline_measurement_key": (
                    str(res.baseline_measurement_key)
                ),
                "measurements": {
                    str(key): val for key, val in res.averages.items()
                },
            }
            for res in loss_analysis.results
        ]
        loss = {
            "baseline": {},
            "pruning": {"model": model_measurements, "ops": ops},
        }

        evaluator = PruningModelEvaluator(
            self.analysis,
            None,
            loss,
        )
        evaluator.eval_baseline(default_pruning_settings().sparsity)
        evaluator.eval_pruning(default_pruning_settings())
        self.final_analysis = evaluator.to_dict_values()

    def build_recipe(self, epochs_pruning_window=None, training_epochs=10):
        """Create the modifier manager describing the pruning schedule.

        Args:
            epochs_pruning_window: Optional mapping whose entries override
                fields of the default epochs distribution.
            training_epochs: Number of epochs passed to
                ``default_epochs_distribution``.

        Returns:
            ScheduledModifierManager: One ``EpochRangeModifier`` followed
            by one ``GMPruningModifier`` per distinct sparsity value.
        """
        self._make_analysis()
        self._compute_loss_sensitivity()

        epochs = default_epochs_distribution(training_epochs)
        if epochs_pruning_window is not None:
            # TODO: set custom parameters
            epochs_dict = epochs._asdict()
            epochs_dict.update(epochs_pruning_window)
            epochs = epochs.__class__(**epochs_dict)

        mods = [
            EpochRangeModifier(
                start_epoch=epochs.start_epoch,
                end_epoch=epochs.end_epoch,
            )
        ]

        node_weight_name_lookup = {
            node["id"]: node["weight_name"]
            for node in self.analysis["nodes"]
            if node["prunable"]
        }

        # Group weight names by target sparsity: one modifier per value.
        sparsity_to_params = {}
        for node in self.final_analysis[0]:
            sparsity = node["sparsity"]
            if sparsity is None:
                continue
            # Looked up only for nodes that are kept, so a skipped node
            # that is missing from the lookup cannot raise KeyError.
            weight_name = node_weight_name_lookup[node["node_id"]]
            sparsity_to_params.setdefault(sparsity, []).append(weight_name)

        for sparsity, params in sparsity_to_params.items():
            mods.append(
                GMPruningModifier(
                    init_sparsity=0.05,
                    final_sparsity=sparsity,
                    start_epoch=epochs.pruning_start_epoch,
                    end_epoch=epochs.pruning_end_epoch,
                    update_frequency=epochs.pruning_update_frequency,
                    params=params,
                )
            )

        return ScheduledModifierManager(mods)
def _train_model(
    model: torch.nn.Module,
    train_data: List[Tuple[Tuple, Any]],
    eval_data: List[Tuple[Tuple, Any]],
    epochs_pruning_window: Dict = None,
    training_epochs: int = 10,
    lr: float = 1e-3,
    momentum: float = 0.9,
    loss_fn: str = "CrossEntropy",
):
    """Prune ``model`` by training it under a generated pruning recipe.

    The model is exported to ONNX in a temporary directory, a recipe is
    built from that file, and the model is then trained by
    ``PruningTrainer``.

    Args:
        model: Model to prune.
        train_data: Training batches as ``(inputs, labels)`` pairs. The
            inputs of the first batch are the example input for the ONNX
            export, and the leading dimension of its first tensor is
            taken as the batch size.
        eval_data: Validation batches, same layout as ``train_data``.
        epochs_pruning_window: Optional overrides for the epochs
            distribution used by the recipe.
        training_epochs: Number of epochs used to build the recipe.
        lr: Learning rate forwarded to the trainer.
        momentum: Momentum forwarded to the trainer.
        loss_fn: Name of the loss forwarded to the trainer.

    Returns:
        The model returned by ``PruningTrainer.train``.
    """
    batch_size = train_data[0][0][0].shape[0]
    with TemporaryDirectory() as tmp_dir:
        onnx_path = _export_model_onnx(
            model, Path(tmp_dir), "model.onnx", train_data[0][0]
        )
        onnx_path = onnx_path.as_posix()

        recipe = RecipeBuilder(onnx_path)
        # TODO: implement custom parameters support
        # The recipe is built inside the ``with`` block because it reads
        # the exported ONNX file, which is deleted on exit.
        manager = recipe.build_recipe(
            epochs_pruning_window=epochs_pruning_window,
            training_epochs=training_epochs,
        )
    trainer = PruningTrainer(model, batch_size)
    # ``loss_fn`` used to be dropped here, so the configured loss never
    # reached the trainer and its built-in default was always used.
    pruned_model = trainer.train(
        manager,
        train_data,
        eval_data,
        lr=lr,
        momentum=momentum,
        loss_fn=loss_fn,
    )
    return pruned_model
def _export_model_onnx(
    model: torch.nn.Module,
    save_path: Path,
    model_name: str,
    input_batch: Tuple,
):
    """Export ``model`` to ONNX under ``save_path`` and return the path.

    When CUDA is available both the model and the input tensors are moved
    to the GPU first. Example outputs are obtained from a forward pass
    run without gradient tracking.

    Args:
        model: Model to export.
        save_path: Output directory given to the exporter.
        model_name: File name of the exported model.
        input_batch: Tuple of tensors used as example input.

    Returns:
        ``save_path / model_name``.
    """
    if torch.cuda.is_available():
        input_batch = tuple(tensor.cuda() for tensor in input_batch)
        model.cuda()

    onnx_exporter = ModuleExporter(model, output_dir=save_path)
    with torch.no_grad():
        sample_outputs = model(*input_batch)
    onnx_exporter.export_onnx(
        input_batch, name=model_name, example_outputs=sample_outputs
    )
    return save_path / model_name
ProjectModelAnalysisSchema().load(analyzer.dict()) def _compute_loss_sensitivity(self): sensitivities = [] parameters = [] for i, node in enumerate(self.analysis["nodes"]): if node["prunable"]: sensitivities.append(node["prunable_equation_sensitivity"]) parameters.append(node["prunable_params"]) loss_analysis = pruning_loss_sens_magnitude(self.model_path) results_model = loss_analysis.results_model results = loss_analysis.results model = { "baseline_measurement_key": ( str(results_model.baseline_measurement_key) ), "measurements": { str(key): val for key, val in results_model.averages.items() }, } ops = [] for res in results: ops.append( { "id": res.id_, "name": res.name, "index": res.index, "baseline_measurement_key": ( str(res.baseline_measurement_key) ), "measurements": { str(key): val for key, val in res.averages.items() }, } ) pruning = {"model": model, "ops": ops} loss = {} loss["baseline"] = {} loss["pruning"] = pruning model = PruningModelEvaluator( self.analysis, None, loss, ) model.eval_baseline(default_pruning_settings().sparsity) model.eval_pruning(default_pruning_settings()) self.final_analysis = model.to_dict_values() def build_recipe(self, epochs_pruning_window=None, training_epochs=10): self._make_analysis() self._compute_loss_sensitivity() if epochs_pruning_window is None: epochs = default_epochs_distribution(training_epochs) else: # TODO: set custom parameters epochs = default_epochs_distribution(training_epochs) epochs_dict = epochs._asdict() epochs_dict.update(epochs_pruning_window) epochs = epochs.__class__(**epochs_dict) mods = [ EpochRangeModifier( start_epoch=epochs.start_epoch, end_epoch=epochs.end_epoch, ) ] node_weight_name_lookup = { node["id"]: node["weight_name"] for node in self.analysis["nodes"] if node["prunable"] } sparsity_to_params = {} nodes = self.final_analysis[0] for node in nodes: sparsity = node["sparsity"] node_id = node["node_id"] weight_name = node_weight_name_lookup[node_id] if sparsity is None: continue if sparsity not 
in sparsity_to_params: sparsity_to_params[sparsity] = [] sparsity_to_params[sparsity].append(weight_name) for sparsity, params in sparsity_to_params.items(): gm_pruning = GMPruningModifier( init_sparsity=0.05, final_sparsity=sparsity, start_epoch=epochs.pruning_start_epoch, end_epoch=epochs.pruning_end_epoch, update_frequency=epochs.pruning_update_frequency, params=params, ) mods.append(gm_pruning) return ScheduledModifierManager(mods) class PruningTrainer: def __init__(self, model, bs): self.data_loader = None self.optimizer = None self.model = model self.batch_size = bs def _setup_training(self, loss_fn=None, lr=1e-3, momentum=0.9): self.device = "cuda" if torch.cuda.is_available() else "cpu" self.model.to(self.device) if loss_fn is None: loss_fn = CrossEntropyLoss() else: loss_fn = CRITERION_FNS.get(loss_fn, CrossEntropyLoss()) self.criterion = loss_fn self.optimizer = SGD(self.model.parameters(), lr=lr, momentum=momentum) def _run_model_one_epoch(self, train=False): if train: self.model.train() data_loader = self.train_data_loader else: self.model.eval() data_loader = self.val_data_loader running_loss = 0.0 for step, (inputs, labels) in tqdm( enumerate(data_loader), total=len(data_loader) ): inputs = tuple(t.to(self.device) for t in inputs) if not isinstance(labels, torch.Tensor): labels = torch.tensor(labels) if len(labels.shape) == 0: labels = labels.unsqueeze(0) labels = labels.to(self.device) if train: self.optimizer.zero_grad() outputs = self.model( *inputs ) # model returns logits and softmax as a tuple loss = self.criterion(outputs, labels) if train: loss.backward() self.optimizer.step() running_loss += loss.item() loss = running_loss / (len(data_loader) + 1e-5) return loss def train( self, manager, train_data_loader, val_data_loader, **train_kwargs ): self.train_data_loader = train_data_loader self.val_data_loader = val_data_loader self._setup_training(**train_kwargs) self.optimizer = manager.modify( self.model, self.optimizer, 
steps_per_epoch=len(self.train_data_loader), ) self.model.train() # Run model pruning epoch = manager.min_epochs while epoch < manager.max_epochs: # run training loop epoch_name = "{}/{}".format(epoch + 1, manager.max_epochs) logger.info("Running Training Epoch {}".format(epoch_name)) train_loss = self._run_model_one_epoch(train=True) logger.info( ("Training Epoch: {}\nTraining Loss: {}\n").format( epoch_name, train_loss ) ) # run validation loop logger.info("Running Validation Epoch {}".format(epoch_name)) val_loss = self._run_model_one_epoch() logger.info( "Validation Epoch: {}\nVal Loss: {}\n".format( epoch_name, val_loss ) ) epoch += 1 manager.finalize(self.model) return self.model def _load_config(config_file: str): with open(config_file, "r") as f: config = json.load(f) return config def _load_data(data_dir: str): data_dir = Path(data_dir) return [torch.load(input_path) for input_path in data_dir.glob("*.pt")] def _load_model(model_file: str): if os.path.isdir(model_file): path = Path(model_file) module_file = path / "module.py" with open(module_file, "r") as f: module_str = f.read() exec(module_str, globals()) model = eval("NebullvmFxModule")() model.load_state_dict(torch.load(path / "state_dict.pt")) else: model = torch.load(model_file) return model def _train_model( model: torch.nn.Module, train_data: List[Tuple[Tuple, Any]], eval_data: List[Tuple[Tuple, Any]], epochs_pruning_window: Dict = None, training_epochs: int = 10, lr: float = 1e-3, momentum: float = 0.9, loss_fn: str = "CrossEntropy", ): batch_size = train_data[0][0][0].shape[0] with TemporaryDirectory() as tmp_dir: onnx_path = _export_model_onnx( model, Path(tmp_dir), "model.onnx", train_data[0][0] ) onnx_path = onnx_path.as_posix() recipe = RecipeBuilder(onnx_path) # TODO: implement custom parameters support manager = recipe.build_recipe( epochs_pruning_window=epochs_pruning_window, training_epochs=training_epochs, ) trainer = PruningTrainer(model, batch_size) pruned_model = trainer.train( 
manager, train_data, eval_data, lr=lr, momentum=momentum ) return pruned_model def _save_model(model: torch.nn.Module, path: str): if path.endswith(".pt"): torch.save(model, path) else: torch.save(model.state_dict(), Path(path) / "pruned_state_dict.pt") def main( model_file: str, train_data_dir: str, eval_data_dir: str, config_file: str, out_file: str, ): config = _load_config(config_file) model = _load_model(model_file) train_data = _load_data(train_data_dir) eval_data = _load_data(eval_data_dir) pruned_model = _train_model(model, train_data, eval_data, **config) _save_model(pruned_model, out_file) if __name__ == "__main__": from argparse import ArgumentParser parser = ArgumentParser() parser.add_argument("--model", help="The model to be pruned.") parser.add_argument( "--train_dir", help="The directory contained the pickled training data.", ) parser.add_argument( "--eval_dir", help="The directory contained the pickled test data." ) parser.add_argument("--config", help="The config file.") parser.add_argument( "--pruned_model", help="Path where storing the pruned model." 
) args = parser.parse_args() main( model_file=args.model, train_data_dir=args.train_dir, eval_data_dir=args.eval_dir, config_file=args.config, out_file=args.pruned_model, ) ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/sparseml.py ================================================ import json from pathlib import Path from tempfile import TemporaryDirectory from typing import Callable, Dict import numpy as np from loguru import logger from nebullvm.operations.optimizations.compressors.base import Compressor from nebullvm.optional_modules.torch import torch, Module from nebullvm.tools.data import DataManager from nebullvm.tools.pytorch import save_with_torch_fx, load_with_torch_fx from nebullvm.tools.venv import run_in_different_venv def _save_model(model: Module, path: Path): try: save_with_torch_fx(model, path) except Exception as ex: logger.warning( f"Got an error while exporting with TorchFX. The model will be " f"saved using the standard PyTorch save pickling method. 
Error " f"got: {ex}" ) torch.save(model, path / "model.pt") return path / "model.pt" else: return path def _load_model(path: Path): if path.is_file(): return torch.load(path) else: return load_with_torch_fx(path) def _save_dataset(input_data: DataManager, path: Path): path.mkdir(exist_ok=True) for i, x in enumerate(input_data): torch.save(x, path / f"input_{i}.pt") def _save_json(dictionary: Dict, path: Path): with open(path, "w") as f: json.dump(dictionary, f) def _write_requirements_file(path: Path): requirements = "sparseml\nsparsify\ntqdm" with open(path, "w") as f: f.write(requirements) class SparseMLCompressor(Compressor): def execute( self, model: Module, train_input_data: DataManager, eval_input_data: DataManager, metric_drop_ths: float, metric: Callable, ): script_path = ( Path(__file__).parent / "scripts/neural_magic_training.py" ) with TemporaryDirectory(dir="") as tmp_dir: tmp_dir = Path(tmp_dir) requirements_file = tmp_dir / "requirements.txt" model_path = _save_model(model, tmp_dir) training_data_dir = tmp_dir / "train" eval_data_dir = tmp_dir / "eval" config_file = tmp_dir / "config.json" pruned_model_path = ( tmp_dir / "pruned_model.pt" if model_path.is_file() else tmp_dir ) _write_requirements_file(requirements_file) _save_dataset(train_input_data, training_data_dir) _save_dataset(eval_input_data, eval_data_dir) _save_json(self._config, config_file) run_in_different_venv( str(requirements_file), str(script_path), torch.cuda.is_available(), "--model", f"{model_path}", "--train_dir", f"{training_data_dir}", "--eval_dir", f"{eval_data_dir}", "--config", f"{config_file}", "--pruned_model", f"{pruned_model_path}", ) self.compressed_model = _load_model(pruned_model_path) if self.compressed_model is not None: error = self._compute_error( model, self.compressed_model, eval_input_data, metric ) if error > metric_drop_ths: self.compressed_model = None else: self.new_metric_ths = metric_drop_ths - error @staticmethod @torch.no_grad() def _compute_error( 
model: Module, pruned_model: Module, eval_input_data: DataManager, metric: Callable, ) -> float: if len(eval_input_data) == 0: return np.inf metric_val = 0.0 model.eval() pruned_model.eval() for inputs, y in eval_input_data: if torch.cuda.is_available(): inputs = tuple(data.cuda() for data in inputs) pruned_model.cuda() model.cuda() model_pred = model(*inputs) pruned_pred = pruned_model(*inputs) metric_val += metric(model_pred, pruned_pred, y) return metric_val / len(eval_input_data) @staticmethod def _get_default_config() -> Dict: return { "training_epochs": 10, "epochs_pruning_window": {"start_epoch": 0, "end_epoch": 10}, "loss_fn": "CrossEntropy", "lr": 1e-3, "momentum": 0.9, } @property def config_key(self) -> str: return "sparseml" ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimize_inference.py ================================================ from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Iterable, Callable, List, Union, Dict, Optional from nebullvm.config import TRAIN_TEST_SPLIT_RATIO from nebullvm.core import types from nebullvm.core.models import ( OptimizeInferenceResult, OriginalModel, OptimizedModel, BenchmarkOriginalModelResult, ModelCompiler, ModelCompressor, OptimizationTime, ModelParams, DeepLearningFramework, ) from nebullvm.operations.base import Operation from nebullvm.operations.conversions.utils import get_conversion_op from nebullvm.operations.measures.measures import LatencyOriginalModelMeasure from nebullvm.operations.measures.utils import QUANTIZATION_METRIC_MAP from nebullvm.operations.optimizations.optimizers.optimizers import ( PytorchOptimizer, TensorflowOptimizer, ONNXOptimizer, ) from nebullvm.operations.optimizations.utils import ( map_compilers_and_compressors, ) from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import DataLoader as TorchDataLoader from 
nebullvm.optional_modules.torch import torch from nebullvm.optional_modules.utils import ( check_dependencies, ) from nebullvm.tools.adapters import ( ModelAdapter, DiffusionAdapter, HuggingFaceAdapter, ) from nebullvm.tools.data import DataManager from nebullvm.tools.diffusers import ( is_diffusion_model_pipe, is_diffusion_model, ) from nebullvm.tools.hardware_utils import get_hw_setup from nebullvm.tools.utils import ( is_huggingface_data, check_input_data, is_data_subscriptable, get_dl_framework, extract_info_from_data, get_model_name, get_model_size_mb, get_throughput, ) class OptimizeInferenceOp(Operation): def __init__(self): super().__init__() self.torch_optimization_op = PytorchOptimizer() self.onnx_optimization_op = ONNXOptimizer() self.tensorflow_optimization_op = TensorflowOptimizer() @staticmethod def _as_data_manager(data) -> DataManager: if isinstance(data, DataManager): return data if check_input_data(data) is False: raise ValueError( "The provided data does not match the expected " "format.\n" "Speedster supports data in the following formats: \n" "- PyTorch DataLoader\n" "- TensorFlow Dataset\n" "- List of tuples: [((input_0, ... ), label), ...] 
\n" "Inputs and labels should be either tensors or numpy " "arrays,\n" "depending on the framework used.\n" ) if is_data_subscriptable(data): return DataManager(data) else: return DataManager.from_iterable(data) @staticmethod def _check_inputs(model: Any, input_data: types.InputData): if model is None: raise ValueError("Input model cannot be None") if len(input_data) == 0: raise ValueError("Input data cannot be empty") def execute( self, model: Any, input_data: types.InputData, metric_drop_ths: float = None, metric: Union[str, Callable] = None, optimization_time: str = "constrained", dynamic_info: Dict = None, config_file: str = None, ignore_compilers: List[str] = None, ignore_compressors: List[str] = None, store_latencies: bool = False, **kwargs, ) -> OptimizeInferenceResult: self._check_inputs(model, input_data) check_dependencies(self.device) ignore_compilers = map_compilers_and_compressors( ignore_compilers, ModelCompiler ) ignore_compressors = map_compilers_and_compressors( ignore_compressors, ModelCompressor ) optimization_time = OptimizationTime(optimization_time) data = input_data if isinstance(data, (TorchDataLoader, tf.data.Dataset)): try: data = DataManager.from_dataloader(data) except Exception: raise ValueError( "The provided dataloader does not match the expected " "format.\n" "Speedster supports dataloaders that return tuples in " "the\n" "following formats: \n" "Single input: (input, label)\n" "Multiple inputs: ((input1, input2, ...), label) or " "(input1, input2, ..., label)\n" "Inputs and labels should be either tensors or numpy " "arrays,\n" "depending on the framework used.\n" ) # Setup adapters model_adapter: Optional[ModelAdapter] = None if is_diffusion_model_pipe(model): self.logger.info( "The provided model is a diffusion model. " "Speedster will optimize the UNet part of the model." 
) model_adapter = DiffusionAdapter(model, data, self.device) elif is_huggingface_data(data[0]): model_adapter = HuggingFaceAdapter( model, data, self.device, **kwargs ) if dynamic_info is None: self.logger.warning( "Dynamic shape info has not been provided for the " "HuggingFace model. The resulting optimized model " "will be usable only with a fixed input shape. " "To optimize the model for dynamic shapes, please " "look here: https://nebuly.gitbook.io/nebuly/modules/" "speedster/how-to-guides" "#using-dynamic-shape." ) # Adapt data and model if model_adapter is not None: data = model_adapter.adapted_data model = model_adapter.adapted_model data = self._as_data_manager(data) dl_framework = get_dl_framework(model) if metric_drop_ths is not None and metric_drop_ths <= 0: metric_drop_ths = None elif metric_drop_ths is not None and metric is None: metric = "numeric_precision" if isinstance(metric, str): metric = QUANTIZATION_METRIC_MAP.get(metric) model_params: ModelParams = extract_info_from_data( model=model, input_data=data, dl_framework=dl_framework, dynamic_info=dynamic_info, device=self.device, is_diffusion=is_diffusion_model(model), ) data.split(TRAIN_TEST_SPLIT_RATIO) # -------- Benchmark original model -------- original_latency_op = LatencyOriginalModelMeasure().to(self.device) orig_model_benchmark: BenchmarkOriginalModelResult = ( original_latency_op.execute( model=model, input_data=data.get_split("test"), dl_framework=dl_framework, ) ) original_model = OriginalModel( model=model, latency_seconds=orig_model_benchmark.latency_seconds, name=get_model_name(model), size_mb=get_model_size_mb(model), framework=dl_framework, throughput=get_throughput( latency=orig_model_benchmark.latency_seconds, # Normal models have batch size B, diffusion # models have batch size 2B batch_size=model_params.batch_size if not is_diffusion_model(model) else model_params.batch_size / 2, ), ) # ------------------------------------------ with TemporaryDirectory() as tmp_dir: tmp_dir = 
Path(tmp_dir) / "fp32" tmp_dir.mkdir(parents=True, exist_ok=True) # Convert model to all available frameworks conversion_op = get_conversion_op(dl_framework) conversion_op.to(self.device).set_state(model, data).execute( save_path=tmp_dir, model_params=model_params, ) # Optimize models optimized_models: List[OptimizedModel] = [] is_diffusion = is_diffusion_model(model) for i, model in enumerate(conversion_op.get_result()): optimized_models += self._optimize( model=model, input_data=data, model_outputs=orig_model_benchmark.model_outputs, optimization_time=optimization_time, metric_drop_ths=metric_drop_ths, metric=metric, model_params=model_params, ignore_compilers=ignore_compilers, ignore_compressors=ignore_compressors, source_dl_framework=dl_framework, pipeline_idx=i + 1, len_pipelines=len(conversion_op.get_result()), is_diffusion=is_diffusion, ) optimized_models.sort(key=lambda x: x.latency_seconds, reverse=False) # Check if at least one optimized model has been created no_optimized_models = len(optimized_models) < 1 no_inference_learners = all( o.inference_learner is None for o in optimized_models ) if no_optimized_models or no_inference_learners: self.logger.warning( "No optimized model has been created. This is likely " "due to a bug during optimization. Please open an issue " "and report in details your use case." 
) # Extract lowest-latency model lowest_latency = self._extract_lowest_latency_model(optimized_models) if model_adapter is not None: original_model = model_adapter.adapt_original_model(original_model) lowest_latency = model_adapter.adapt_inference_learner( lowest_latency ) return OptimizeInferenceResult( original_model=original_model, optimized_model=lowest_latency, hardware_setup=get_hw_setup(), ) def _optimize( self, model: Any, model_outputs: Iterable, input_data: types.InputData, optimization_time: OptimizationTime, metric_drop_ths: float, metric: Callable, model_params: ModelParams, ignore_compilers: List[ModelCompiler], ignore_compressors: List[ModelCompressor], source_dl_framework: DeepLearningFramework, pipeline_idx: int, len_pipelines: int, is_diffusion: bool, ) -> List[OptimizedModel]: if isinstance(model, torch.nn.Module): optimization_op = self.torch_optimization_op self.logger.info( f"[{pipeline_idx}/{len_pipelines}] Running PyTorch " f"Optimization Pipeline" ) elif isinstance(model, tf.Module): optimization_op = self.tensorflow_optimization_op self.logger.info( f"[{pipeline_idx}/{len_pipelines}] Running TensorFlow " f"Optimization Pipeline" ) else: optimization_op = self.onnx_optimization_op self.logger.info( f"[{pipeline_idx}/{len_pipelines}] Running ONNX " f"Optimization Pipeline" ) # Run optimization optimized_models = optimization_op.to(self.device).execute( model=model, input_data=input_data, optimization_time=optimization_time, metric_drop_ths=metric_drop_ths, metric=metric, model_params=model_params, model_outputs=model_outputs, ignore_compilers=ignore_compilers, ignore_compressors=ignore_compressors, source_dl_framework=source_dl_framework, is_diffusion=is_diffusion, ) if isinstance(model, torch.nn.Module): optimization_op.free_model_gpu(model) return optimized_models @staticmethod def _extract_lowest_latency_model( models: List[OptimizedModel], ) -> Optional[OptimizedModel]: # fmt: off inference_learner_models = [ m for m in models if 
m.inference_learner is not None ] # fmt: on if len(inference_learner_models) == 0: return None return min(inference_learner_models, key=lambda m: m.latency_seconds) ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/__init__.py ================================================ ================================================ FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/base.py ================================================ import abc from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List, Tuple, Type, Union from nebullvm.config import ACTIVATION_METRIC_DROP_THS from nebullvm.core.models import ( OptimizedModel, OptimizationTime, ModelParams, ModelCompiler, ModelCompressor, DeepLearningFramework, DeviceType, QuantizationType, ) from nebullvm.operations.base import Operation from nebullvm.operations.inference_learners.base import ( BuildInferenceLearner, ) from nebullvm.operations.inference_learners.builders import ( DeepSparseBuildInferenceLearner, FasterTransformerBuildInferenceLearner, IntelNeuralCompressorBuildInferenceLearner, ONNXApacheTVMBuildInferenceLearner, ONNXBuildInferenceLearner, ONNXTensorRTBuildInferenceLearner, OpenVINOBuildInferenceLearner, PyTorchApacheTVMBuildInferenceLearner, PyTorchTensorRTBuildInferenceLearner, TensorflowBuildInferenceLearner, TFLiteBuildInferenceLearner, TorchNeuronBuildInferenceLearner, TorchXLABuildInferenceLearner, TorchDynamoBuildInferenceLearner, TorchScriptBuildInferenceLearner, ) from nebullvm.operations.measures.measures import MetricDropMeasure from nebullvm.operations.measures.utils import ( compute_optimized_running_time, compute_relative_difference, ) from nebullvm.operations.optimizations.compilers.base import Compiler from nebullvm.operations.optimizations.compilers.deepsparse import ( DeepSparseCompiler, ) from nebullvm.operations.optimizations.compilers.faster_transformer import ( 
FasterTransformerCompiler, ) from nebullvm.operations.optimizations.compilers.intel_neural_compressor import ( # noqa: E501 IntelNeuralCompressorCompiler, ) from nebullvm.operations.optimizations.compilers.onnxruntime import ( ONNXCompiler, ) from nebullvm.operations.optimizations.compilers.openvino import ( OpenVINOCompiler, ) from nebullvm.operations.optimizations.compilers.tensor_rt import ( ONNXTensorRTCompiler, PyTorchTensorRTCompiler, ) from nebullvm.operations.optimizations.compilers.tensorflow import ( TensorflowBackendCompiler, TFLiteBackendCompiler, ) from nebullvm.operations.optimizations.compilers.torch_dynamo import ( TorchDynamoCompiler, ) from nebullvm.operations.optimizations.compilers.torch_neuron import ( TorchNeuronCompiler, ) from nebullvm.operations.optimizations.compilers.torch_xla import ( TorchXLACompiler, ) from nebullvm.operations.optimizations.compilers.torchscript import ( TorchScriptCompiler, ) from nebullvm.operations.optimizations.compilers.tvm import ( ONNXApacheTVMCompiler, PyTorchApacheTVMCompiler, ) from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import torch from nebullvm.tools.data import DataManager from nebullvm.tools.transformations import MultiStageTransformation from nebullvm.tools.utils import get_throughput class Optimizer(Operation, abc.ABC): def __init__(self): super().__init__() self.optimized_models = [] self.source_dl_framework = None self.pipeline_dl_framework = None self.compiler_ops = {} self.build_inference_learner_ops = {} self.validity_check_op = MetricDropMeasure() def execute( self, model: Any, input_data: DataManager, optimization_time: OptimizationTime, metric_drop_ths: float, metric: Callable, model_params: ModelParams, model_outputs: List[Tuple[Any, ...]], ignore_compilers: List[ModelCompiler], ignore_compressors: List[ModelCompressor], source_dl_framework: DeepLearningFramework, is_diffusion: bool = False, ) -> List[OptimizedModel]: 
self.source_dl_framework = source_dl_framework # TODO: implement and select compressors from hardware compilers = self._select_compilers_from_hardware() remove_compiler_list = [] add_compiler_list = [] for compiler in ignore_compilers: if compiler in MULTI_FRAMEWORK_COMPILERS: add_compiler_list += MULTI_FRAMEWORK_COMPILERS[compiler] remove_compiler_list.append(compiler) for c in remove_compiler_list: ignore_compilers.remove(c) ignore_compilers += add_compiler_list ( self.compiler_ops, self.build_inference_learner_ops, ) = self._load_compilers( ignore_compilers=ignore_compilers, compilers=compilers, ) self._optimize( model=model, input_data=input_data, optimization_time=optimization_time, metric_drop_ths=metric_drop_ths, metric=metric, model_params=model_params, model_outputs=model_outputs, ignore_compilers=ignore_compilers, is_diffusion=is_diffusion, ) return self.optimized_models @abc.abstractmethod def _select_compilers_from_hardware(self): raise NotImplementedError() @staticmethod def _load_compilers( ignore_compilers: List[ModelCompiler], compilers: List[ModelCompiler], ): compiler_ops = { compiler: COMPILER_TO_OPTIMIZER_MAP[compiler]() for compiler in compilers if compiler not in ignore_compilers and compiler in COMPILER_TO_OPTIMIZER_MAP } build_inference_learner_ops = { compiler: COMPILER_TO_INFERENCE_LEARNER_MAP[compiler]() for compiler in compilers if compiler not in ignore_compilers and compiler in COMPILER_TO_OPTIMIZER_MAP } return compiler_ops, build_inference_learner_ops def free_model_gpu(self, model: Any): # Free gpu memory if self.device.type is DeviceType.GPU: try: model.cpu() except Exception: pass try: with torch.cuda.device(self.device.to_torch_format()): torch.cuda.empty_cache() except Exception: pass def _optimize( self, model: Union[torch.nn.Module, tf.Module, str], input_data: DataManager, optimization_time: OptimizationTime, metric_drop_ths: float, metric: Callable, model_params: ModelParams, model_outputs: List[Tuple[Any, ...]], 
ignore_compilers: List[ModelCompiler], is_diffusion: bool = False, ): if metric_drop_ths is not None: q_types = [ None, ] if metric_drop_ths > 0: q_types.append(QuantizationType.HALF) if metric_drop_ths > ACTIVATION_METRIC_DROP_THS: q_types.append(QuantizationType.DYNAMIC) if input_data is not None: q_types.append(QuantizationType.STATIC) else: q_types = [None] optimization_info = [] for compiler, compiler_op, build_inference_learner_op in zip( self.compiler_ops.keys(), self.compiler_ops.values(), self.build_inference_learner_ops.values(), ): for q_type in q_types: input_tfms = MultiStageTransformation([]) self.free_model_gpu(model) with TemporaryDirectory() as tmp_dir: try: compiler_op.to(self.device).execute( model=model, input_data=input_data, model_params=model_params, metric_drop_ths=metric_drop_ths if q_type is not None else None, quantization_type=q_type, input_tfms=input_tfms, onnx_output_path=tmp_dir, is_diffusion=is_diffusion, ) compiled_model = compiler_op.get_result() if compiled_model is not None: build_inference_learner_op.to(self.device).execute( model=compiled_model, model_orig=compiler_op.model_orig if hasattr(compiler_op, "model_orig") else None, model_params=model_params, input_tfms=input_tfms, source_dl_framework=self.source_dl_framework, quantization_type=q_type, ) inference_learner = ( build_inference_learner_op.get_result() ) if inference_learner is not None: test_input_data, ys = input_data.get_split( "test" ).get_list(with_ys=True) self.validity_check_op.execute( inference_learner, test_input_data, model_outputs, metric_drop_ths, metric_func=metric if q_type is not None else compute_relative_difference, ys=ys, ) if self.validity_check_op.valid: latency = compute_optimized_running_time( inference_learner, input_data ) self.logger.info( f"Optimized model latency: {latency} " f"sec/iter" ) if ( compiler not in ignore_compilers and optimization_time is OptimizationTime.CONSTRAINED ): ignore_compilers.append(compiler) 
self.optimized_models.append( OptimizedModel( inference_learner=inference_learner, # noqa: E501 metric_drop=self.validity_check_op.measure_result, # noqa: E501 compiler=compiler, technique=q_type.name if q_type is not None else "None", latency_seconds=latency, throughput=get_throughput( latency, # Normal models have batch # size B, diffusion models # have batch size 2B model_params.batch_size if not is_diffusion else model_params.batch_size / 2, ), size_mb=inference_learner.get_size() # noqa: E501 / 1e6, ) ) opt_info_dict = { "compiler": f"{self.pipeline_dl_framework.value}_{compiler.value}", # noqa: E501 "technique": q_type.value if q_type else "none", "latency": latency, } if ( metric_drop_ths is not None and q_type is not None ): opt_info_dict[ "metric_loss" ] = ( self.validity_check_op.measure_result # noqa: E501 ) opt_info_dict[ "metric" ] = metric.__name__ optimization_info.append(opt_info_dict) else: self.logger.warning( "The optimized model will be " "discarded due to poor results " "obtained with the given metric." ) if self.device.type in [ DeviceType.GPU, DeviceType.TPU, ]: inference_learner.free_gpu_memory() except Exception as ex: self.logger.warning( f"Optimization failed with " f"{self.pipeline_dl_framework} " f"interface of {compiler}. Got error {ex}. " f"If possible the compilation will be re-scheduled" f" with another interface. Please consult the " f"documentation for further info or open an issue " f"on GitHub for receiving assistance." 
) optimization_info.append( { "compiler": compiler.value, "technique": q_type.value if q_type else "none", "latency": -1, } ) if self.feedback_collector is not None: self.feedback_collector.store_info( key="optimizations", value=optimization_info, ) MULTI_FRAMEWORK_COMPILERS = { ModelCompiler.TENSOR_RT: [ ModelCompiler.TENSOR_RT_TORCH, ModelCompiler.TENSOR_RT_ONNX, ], ModelCompiler.APACHE_TVM: [ ModelCompiler.APACHE_TVM_TORCH, ModelCompiler.APACHE_TVM_ONNX, ], } COMPILER_TO_OPTIMIZER_MAP: Dict[ModelCompiler, Type[Compiler]] = { ModelCompiler.TORCHSCRIPT: TorchScriptCompiler, ModelCompiler.DEEPSPARSE: DeepSparseCompiler, ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorCompiler, ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTCompiler, ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTCompiler, ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMCompiler, ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMCompiler, ModelCompiler.ONNX_RUNTIME: ONNXCompiler, ModelCompiler.OPENVINO: OpenVINOCompiler, ModelCompiler.TFLITE: TFLiteBackendCompiler, ModelCompiler.XLA: TensorflowBackendCompiler, ModelCompiler.TORCH_NEURON: TorchNeuronCompiler, ModelCompiler.TORCH_XLA: TorchXLACompiler, ModelCompiler.TORCH_DYNAMO: TorchDynamoCompiler, ModelCompiler.FASTER_TRANSFORMER: FasterTransformerCompiler, } COMPILER_TO_INFERENCE_LEARNER_MAP: Dict[ ModelCompiler, Type[BuildInferenceLearner] ] = { ModelCompiler.TORCHSCRIPT: TorchScriptBuildInferenceLearner, ModelCompiler.DEEPSPARSE: DeepSparseBuildInferenceLearner, ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorBuildInferenceLearner, # noqa: E501 ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTBuildInferenceLearner, ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTBuildInferenceLearner, ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMBuildInferenceLearner, ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMBuildInferenceLearner, ModelCompiler.ONNX_RUNTIME: ONNXBuildInferenceLearner, ModelCompiler.OPENVINO: OpenVINOBuildInferenceLearner, 
ModelCompiler.TFLITE: TFLiteBuildInferenceLearner,
    ModelCompiler.XLA: TensorflowBuildInferenceLearner,
    ModelCompiler.TORCH_NEURON: TorchNeuronBuildInferenceLearner,
    ModelCompiler.TORCH_XLA: TorchXLABuildInferenceLearner,
    ModelCompiler.TORCH_DYNAMO: TorchDynamoBuildInferenceLearner,
    ModelCompiler.FASTER_TRANSFORMER: FasterTransformerBuildInferenceLearner,
}

================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/optimizers.py
================================================
import platform

from nebullvm.core.models import (
    DeepLearningFramework,
    DeviceType,
    ModelCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import Optimizer
from nebullvm.operations.optimizations.compilers.utils import (
    tvm_is_available,
    bladedisc_is_available,
    deepsparse_is_available,
    intel_neural_compressor_is_available,
    torch_tensorrt_is_available,
    onnxruntime_is_available,
    tensorrt_is_available,
    openvino_is_available,
    torch_neuron_is_available,
    torch_xla_is_available,
    faster_transformer_is_available,
)
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.utils import (
    torch_is_available,
    tensorflow_is_available,
    onnx_is_available,
)
from nebullvm.tools.utils import check_module_version


class PytorchOptimizer(Optimizer):
    """Optimizer for the PyTorch pipeline."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.PYTORCH

    def _select_compilers_from_hardware(self):
        """Return the PyTorch compilers usable on ``self.device``.

        Raises:
            RuntimeError: On TPU or Neuron devices when the required
                backend (torch-xla / torch-neuron) is not installed.
        """
        # NOTE(review): the nesting below is reconstructed from a
        # whitespace-collapsed source -- confirm against upstream.
        compilers = []
        if torch_is_available():
            if self.device.type is DeviceType.TPU:
                if torch_xla_is_available():
                    compilers.append(ModelCompiler.TORCH_XLA)
                else:
                    raise RuntimeError(
                        "Torch XLA is not available on your platform. "
                        "Please install torch-xla the readme at this "
                        "link: https://github.com/pytorch/xla"
                    )
            elif self.device.type is DeviceType.NEURON:
                if torch_neuron_is_available():
                    compilers.append(ModelCompiler.TORCH_NEURON)
                else:
                    raise RuntimeError(
                        "Torch Neuron is not available on your platform. "
                        "Please install torch-neuron by following "
                        "this guide: https://awsdocs-neuron"
                        ".readthedocs-hosted.com/en/latest/general/"
                        "quick-start/torch-neuron.html."
                    )
            else:
                compilers.append(ModelCompiler.TORCHSCRIPT)
                # The trailing ``and False`` makes this branch unreachable
                # on purpose (see the comment inside).
                if (
                    check_module_version(torch, min_version="2.0.0")
                    and platform.system() != "Windows"
                    and False
                ):
                    # Deactivated because save and load methods are
                    # not implemented
                    compilers.append(ModelCompiler.TORCH_DYNAMO)
                if tvm_is_available():
                    compilers.append(ModelCompiler.APACHE_TVM_TORCH)
                if bladedisc_is_available():
                    compilers.append(ModelCompiler.BLADEDISC)

                if self.device.type is DeviceType.CPU:
                    if deepsparse_is_available():
                        compilers.append(ModelCompiler.DEEPSPARSE)
                    if intel_neural_compressor_is_available():
                        compilers.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)
                elif self.device.type is DeviceType.GPU:
                    if torch_tensorrt_is_available():
                        compilers.append(ModelCompiler.TENSOR_RT_TORCH)
                    if faster_transformer_is_available():
                        compilers.append(ModelCompiler.FASTER_TRANSFORMER)
        return compilers


class TensorflowOptimizer(Optimizer):
    """Optimizer for the TensorFlow pipeline."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.TENSORFLOW

    def _select_compilers_from_hardware(self):
        """Return XLA and TFLite when TensorFlow is available."""
        compilers = []
        if tensorflow_is_available():
            compilers.append(ModelCompiler.XLA)
            compilers.append(ModelCompiler.TFLITE)
        return compilers


class ONNXOptimizer(Optimizer):
    """Optimizer for the ONNX pipeline."""

    def __init__(self):
        super().__init__()
        # The ONNX pipeline is registered under the NUMPY framework.
        self.pipeline_dl_framework = DeepLearningFramework.NUMPY

    def _select_compilers_from_hardware(self):
        """Return the ONNX compilers usable on ``self.device``."""
        # NOTE(review): the nesting under ``onnx_is_available()`` is
        # reconstructed -- confirm against upstream.
        compilers = []
        if onnx_is_available():
            if onnxruntime_is_available():
                compilers.append(ModelCompiler.ONNX_RUNTIME)
            if tvm_is_available():
                compilers.append(ModelCompiler.APACHE_TVM_ONNX)
            if self.device.type is DeviceType.GPU and tensorrt_is_available():
                compilers.append(ModelCompiler.TENSOR_RT_ONNX)
            if self.device.type is DeviceType.CPU and openvino_is_available():
                compilers.append(ModelCompiler.OPENVINO)
        return compilers

================================================
FILE:
@pytest.mark.parametrize(
    ("output_library", "dynamic"),
    [
        # The dynamic=True case is left out: dynamic batch size is
        # currently not supported by deepsparse.
        (DeepLearningFramework.PYTORCH, False),
    ],
)
@pytest.mark.skipif(
    not deepsparse_is_available(),
    reason="Can't test deepsparse if it's not installed.",
)
def test_deepsparse(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type=None,
):
    """Compile a model with DeepSparse and check the resulting learner.

    Covers the learner type, save/load round trip, agreement between the
    original and reloaded learner, and the metric-drop validity check.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, None, output_library, device)

        # Compile the model.
        compiler = DeepSparseCompiler()
        compiler.to(device).execute(
            model=model,
            onnx_output_path=tmp_dir,
            model_params=model_params,
            quantization_type=None,
            input_data=input_data,
        )
        compiled = compiler.get_result()

        # Wrap the compiled artefact into an inference learner.
        builder = COMPILER_TO_INFERENCE_LEARNER_MAP[ModelCompiler.DEEPSPARSE]()
        builder.to(device).execute(
            model=compiled,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        learner = builder.get_result()
        expected_cls = DEEPSPARSE_INFERENCE_LEARNERS[output_library]
        assert isinstance(learner, expected_cls)
        assert isinstance(learner.get_size(), int)

        # Save/load round trip.
        learner.save(tmp_dir)
        reloaded = load_model(tmp_dir)
        assert isinstance(reloaded, expected_cls)

        example = learner.get_inputs_example()
        out = learner(*example)
        assert out is not None
        out_reloaded = reloaded(*example)
        assert all(
            torch.allclose(a, b) for a, b in zip(out, out_reloaded)
        )

        # Validity of the optimized model on the test split.
        test_inputs, ys = input_data.get_split("test").get_list(
            with_ys=True
        )
        if quantization_type is not None:
            metric_func = metric
        else:
            metric_func = compute_relative_difference
        validity_check = MetricDropMeasure()
        validity_check.execute(
            learner,
            test_inputs,
            model_outputs,
            CONSTRAINED_METRIC_DROP_THS,
            metric_func=metric_func,
            ys=ys,
        )
        assert validity_check.get_result()
@pytest.mark.parametrize(
    ("output_library", "dynamic", "metric_drop_ths", "quantization_type"),
    [
        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.DYNAMIC),
        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.DYNAMIC),
        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.STATIC),
        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.STATIC),
    ],
)
@pytest.mark.skipif(
    not intel_neural_compressor_is_available(),
    reason="Can't test neural compressor if it's not installed.",
)
def test_neural_compressor(
    output_library: DeepLearningFramework,
    dynamic: bool,
    metric_drop_ths: float,
    quantization_type: QuantizationType,
):
    """Quantize a model with Intel Neural Compressor and check the learner.

    Covers the learner type, save/load round trip, agreement between the
    original and reloaded learner, the metric-drop validity check and,
    for dynamic models, agreement with the source model on a smaller
    batch.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, None, output_library, device)

        compiler_op = IntelNeuralCompressorCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.INTEL_NEURAL_COMPRESSOR
        ]()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        learner_cls = NEURAL_COMPRESSOR_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = optimized_model.get_inputs_example()
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(res_tensor, res_loaded_tensor)
            for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)
        )

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if dynamic:
            # Check also with a smaller batch size. The optimized learner
            # must be the one under test here: calling the source model
            # for both sides would compare it with itself.
            inputs_example = [
                input_[: len(input_) // 2] for input_ in inputs_example
            ]
            res = optimized_model(*inputs_example)
            assert res is not None

            res_orig = tuple(model(*inputs_example))
            assert all(
                torch.allclose(res_tensor, res_orig_tensor, rtol=1e-01)
                for (res_tensor, res_orig_tensor) in zip(res, res_orig)
            )
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
        "external_data_format",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None, True),
        (DeepLearningFramework.PYTORCH, True, None, None, None, False),
        (DeepLearningFramework.PYTORCH, False, None, None, None, False),
    ],
)
def test_onnxruntime(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
    external_data_format: bool,
):
    """Convert a model to ONNX, compile it with ONNX Runtime and check it.

    Covers the learner type, save/load round trip, agreement between the
    original and reloaded learner, the validity check and, for dynamic
    models, agreement with the source model on a smaller batch.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        # Export the PyTorch model and pick the ONNX file among the results.
        export_dir = Path(tmp_dir) / "fp32"
        export_dir.mkdir(parents=True)
        converter = PytorchConverter()
        converter.to(device).set_state(model, input_data).execute(
            export_dir, model_params
        )
        converted = converter.get_result()
        assert len(converted) > 1
        onnx_paths = [item for item in converted if isinstance(item, Path)]
        model_path = str(onnx_paths[0])

        # Test onnx external data format (large models)
        if external_data_format:
            onnx.save_model(
                onnx.load(model_path),
                model_path,
                save_as_external_data=True,
                all_tensors_to_one_file=False,
            )

        compiler = ONNXCompiler()
        compiler.to(device).execute(
            model=model_path,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled = compiler.get_result()

        builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.ONNX_RUNTIME
        ]()
        builder.to(device).execute(
            model=compiled,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
            quantization_type=quantization_type,
        )
        learner = builder.get_result()
        expected_cls = ONNX_INFERENCE_LEARNERS[output_library]
        assert isinstance(learner, expected_cls)

        # Save/load round trip.
        learner.save(tmp_dir)
        reloaded = load_model(tmp_dir)
        assert isinstance(reloaded, expected_cls)

        assert isinstance(learner.get_size(), int)

        example = list(learner.get_inputs_example())
        out = learner(*example)
        assert out is not None

        out_reloaded = reloaded(*example)
        assert all(
            torch.allclose(a, b) for a, b in zip(out, out_reloaded)
        )

        # Validity of the optimized model.
        assert check_model_validity(
            learner,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Check also with a smaller batch size.
        torch_device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        half_batch = [
            tensor[: len(tensor) // 2].to(torch_device) for tensor in example
        ]
        out = learner(*half_batch)
        assert out is not None

        with torch.inference_mode():
            out_orig = tuple(model(*half_batch))
        assert all(
            torch.allclose(a, b, rtol=2e-01) for a, b in zip(out, out_orig)
        )
bool, quantization_type: QuantizationType, metric_drop_ths: int, metric: str, external_data_format: bool, ): with TemporaryDirectory() as tmp_dir: ( model, input_data, model_params, input_tfms, model_outputs, metric, ) = initialize_model(dynamic, metric, output_library, device) model_path = Path(tmp_dir) / "fp32" model_path.mkdir(parents=True) converter_op = PytorchConverter() converter_op.to(device).set_state(model, input_data).execute( model_path, model_params ) converted_models = converter_op.get_result() assert len(converted_models) > 1 model_path = str( [model for model in converted_models if isinstance(model, Path)][0] ) # Test onnx external data format (large models) if external_data_format: onnx_model = onnx.load(model_path) onnx.save_model( onnx_model, model_path, save_as_external_data=True, all_tensors_to_one_file=False, ) compiler_op = ONNXCompiler() compiler_op.to(device).execute( model=model_path, input_tfms=input_tfms, metric_drop_ths=metric_drop_ths, quantization_type=quantization_type, input_data=input_data, ) compiled_model = compiler_op.get_result() build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[ ModelCompiler.ONNX_RUNTIME ]() build_inference_learner_op.to(device).execute( model=compiled_model, model_orig=compiler_op.model_orig if hasattr(compiler_op, "model_orig") else None, model_params=model_params, input_tfms=input_tfms, source_dl_framework=output_library, quantization_type=quantization_type, ) optimized_model = build_inference_learner_op.get_result() assert isinstance( optimized_model, ONNX_INFERENCE_LEARNERS[output_library] ) # Test save and load functions optimized_model.save(tmp_dir) loaded_model = load_model(tmp_dir) assert isinstance( loaded_model, ONNX_INFERENCE_LEARNERS[output_library] ) assert isinstance(optimized_model.get_size(), int) inputs_example = list(optimized_model.get_inputs_example()) res = optimized_model(*inputs_example) assert res is not None res_loaded = loaded_model(*inputs_example) assert all( [ 
torch.allclose(res_tensor, res_loaded_tensor) for (res_tensor, res_loaded_tensor) in zip(res, res_loaded) ] ) # Test validity of the model valid = check_model_validity( optimized_model, input_data, model_outputs, metric_drop_ths, quantization_type, metric, ) assert valid if dynamic: # Check also with a smaller bath_size torch_device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) inputs_example = [ input_[: len(input_) // 2].to(torch_device) for input_ in inputs_example ] res = optimized_model(*inputs_example) assert res is not None with torch.inference_mode(): res_orig = tuple(model(*inputs_example)) assert all( [ torch.allclose(res_tensor, res_orig_tensor, rtol=2e-01) for (res_tensor, res_orig_tensor) in zip(res, res_orig) ] ) @pytest.mark.parametrize( ( "output_library", "dynamic", "quantization_type", "metric_drop_ths", "metric", "external_data_format", ), [ ( DeepLearningFramework.PYTORCH, True, QuantizationType.HALF, 2, "numeric_precision", False, ), ( DeepLearningFramework.PYTORCH, True, QuantizationType.HALF, 2, "numeric_precision", True, ), ], ) @pytest.mark.skipif( sys.platform == "win32", reason="onnxruntime with half precision on windows does not work", ) @pytest.mark.skipif( not torch.cuda.is_available(), reason="onnxruntime with half precision is very slow on CPU", ) def test_onnxruntime_half( output_library: DeepLearningFramework, dynamic: bool, quantization_type: QuantizationType, metric_drop_ths: int, metric: str, external_data_format: bool, ): with TemporaryDirectory() as tmp_dir: ( model, input_data, model_params, input_tfms, model_outputs, metric, ) = initialize_model(dynamic, metric, output_library, device) model_path = Path(tmp_dir) / "fp32" model_path.mkdir(parents=True) converter_op = PytorchConverter() converter_op.to(device).set_state(model, input_data).execute( model_path, model_params ) converted_models = converter_op.get_result() assert len(converted_models) > 1 model_path = str( [model for model in converted_models if 
isinstance(model, Path)][0] ) # Test onnx external data format (large models) if external_data_format: onnx_model = onnx.load(model_path) onnx.save_model( onnx_model, model_path, save_as_external_data=True, all_tensors_to_one_file=False, ) compiler_op = ONNXCompiler() compiler_op.to(device).execute( model=model_path, input_tfms=input_tfms, metric_drop_ths=metric_drop_ths, quantization_type=quantization_type, input_data=input_data, ) compiled_model = compiler_op.get_result() build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[ ModelCompiler.ONNX_RUNTIME ]() build_inference_learner_op.to(device).execute( model=compiled_model, model_orig=compiler_op.model_orig if hasattr(compiler_op, "model_orig") else None, model_params=model_params, input_tfms=input_tfms, source_dl_framework=output_library, quantization_type=quantization_type, ) optimized_model = build_inference_learner_op.get_result() assert isinstance( optimized_model, ONNX_INFERENCE_LEARNERS[output_library] ) # Test save and load functions optimized_model.save(tmp_dir) loaded_model = ONNX_INFERENCE_LEARNERS[output_library].load(tmp_dir) assert isinstance( loaded_model, ONNX_INFERENCE_LEARNERS[output_library] ) assert isinstance(optimized_model.get_size(), int) inputs_example = list(optimized_model.get_inputs_example()) res = optimized_model(*inputs_example) assert res is not None res_loaded = loaded_model(*inputs_example) assert all( [ torch.allclose(res_tensor, res_loaded_tensor) for (res_tensor, res_loaded_tensor) in zip(res, res_loaded) ] ) # Test validity of the model valid = check_model_validity( optimized_model, input_data, model_outputs, metric_drop_ths, quantization_type, metric, ) assert valid if dynamic: # Check also with a smaller bath_size torch_device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) inputs_example = [ input_[: len(input_) // 2].to(torch_device) for input_ in inputs_example ] res = optimized_model(*inputs_example) assert res is not None with 
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    # Use .get(): a missing "brand_raw" key must lead to a skip, not to a
    # KeyError while the module is being collected.
    "intel" not in cpuinfo.get_cpu_info().get("brand_raw", "").lower(),
    reason="Openvino is only available for intel processors.",
)
def test_openvino(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Convert a model to ONNX, compile it with OpenVINO and check it.

    Covers the learner type, save/load round trip, agreement between the
    original and reloaded learner, the validity check and, for dynamic
    models, agreement with the source model on a smaller batch.
    """
    device = Device(DeviceType.CPU)
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        model_path = Path(tmp_dir) / "fp32"
        model_path.mkdir(parents=True)

        converter_op = PytorchConverter()
        converter_op.to(device).set_state(model, input_data).execute(
            model_path, model_params
        )

        converted_models = converter_op.get_result()
        assert len(converted_models) > 1

        # The ONNX export is the only Path among the converted models.
        model_path = str(
            [model for model in converted_models if isinstance(model, Path)][0]
        )

        compiler_op = OpenVINOCompiler()
        compiler_op.to(device).execute(
            model=model_path,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.OPENVINO
        ]()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        learner_cls = OPENVINO_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(res_tensor, res_loaded_tensor)
            for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)
        )

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if dynamic:
            # Check also with a smaller batch size
            inputs_example = [
                input_[: len(input_) // 2] for input_ in inputs_example
            ]
            res = optimized_model(*inputs_example)
            assert res is not None

            res_orig = tuple(model(*inputs_example))
            assert all(
                torch.allclose(
                    res_tensor.float(), res_orig_tensor, rtol=2e-01
                )
                for (res_tensor, res_orig_tensor) in zip(res, res_orig)
            )
str, ): with TemporaryDirectory() as tmp_dir: ( model, input_data, model_params, input_tfms, model_outputs, metric, ) = initialize_model(dynamic, metric, output_library, device) model_path = Path(tmp_dir) / "fp32" model_path.mkdir(parents=True) converter_op = PytorchConverter() converter_op.to(device).set_state(model, input_data).execute( model_path, model_params ) converted_models = converter_op.get_result() assert len(converted_models) > 1 model_path = str( [model for model in converted_models if isinstance(model, Path)][0] ) compiler_op = ONNXTensorRTCompiler() compiler_op.to(device).execute( model=model_path, model_params=model_params, input_tfms=input_tfms, metric_drop_ths=metric_drop_ths, quantization_type=quantization_type, input_data=input_data, ) compiled_model = compiler_op.get_result() build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[ ModelCompiler.TENSOR_RT_ONNX ]() build_inference_learner_op.to(device).execute( model=compiled_model, model_orig=compiler_op.model_orig if hasattr(compiler_op, "model_orig") else None, model_params=model_params, input_tfms=input_tfms, source_dl_framework=output_library, ) optimized_model = build_inference_learner_op.get_result() assert isinstance( optimized_model, TENSOR_RT_INFERENCE_LEARNERS[output_library] ) # Test save and load functions optimized_model.save(tmp_dir) loaded_model = load_model(tmp_dir) assert isinstance( loaded_model, TENSOR_RT_INFERENCE_LEARNERS[output_library] ) assert isinstance(optimized_model.get_size(), int) inputs_example = tuple(optimized_model.get_inputs_example()) res = optimized_model(*inputs_example) assert res is not None res_loaded = loaded_model(*inputs_example) assert all( [ torch.allclose(res_tensor, res_loaded_tensor) for (res_tensor, res_loaded_tensor) in zip(res, res_loaded) ] ) # Test validity of the model valid = check_model_validity( optimized_model, input_data, model_outputs, metric_drop_ths, quantization_type, metric, ) assert valid if dynamic: torch_device = 
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
@pytest.mark.skipif(
    not check_module_version(torch, max_version="1.13.1+cu117"),
    reason="Skip because torch version is not supported.",
)
def test_tensorrt_torch(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile a PyTorch model with Torch-TensorRT and check the learner.

    Covers the learner type, save/load round trip, agreement between the
    original and reloaded learner, the validity check and, for dynamic
    models, agreement with the source model on a smaller batch.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler = PyTorchTensorRTCompiler()
        compiler.to(device).execute(
            model=model,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled = compiler.get_result()

        builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TENSOR_RT_TORCH
        ]()
        builder.to(device).execute(
            model=compiled,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        learner = builder.get_result()
        assert isinstance(learner, PytorchTensorRTInferenceLearner)

        # Save/load round trip, using the learner class' own loader.
        learner.save(tmp_dir)
        reloaded = PytorchTensorRTInferenceLearner.load(tmp_dir)
        assert isinstance(reloaded, PytorchTensorRTInferenceLearner)

        assert isinstance(learner.get_size(), int)

        example = tuple(learner.get_inputs_example())
        out = learner(*example)
        assert out is not None

        out_reloaded = reloaded(*example)
        assert all(
            torch.allclose(a, b) for a, b in zip(out, out_reloaded)
        )

        # Validity of the optimized model.
        assert check_model_validity(
            learner,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Check also with a smaller batch size.
        torch_device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        half_batch = [
            tensor[: len(tensor) // 2].to(torch_device) for tensor in example
        ]
        out = learner(*half_batch)
        assert out is not None

        out_orig = tuple(model(*half_batch))
        assert all(
            torch.allclose(a.float(), b, rtol=1e-01)
            for a, b in zip(out, out_orig)
        )
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.TENSORFLOW, False, None, None, None),
        (DeepLearningFramework.TENSORFLOW, True, None, None, None),
    ],
)
def test_tensorflow_backend(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile a TensorFlow model with the XLA backend and check it.

    Covers the learner type, save/load round trip, the validity check
    and, for dynamic models, prediction on a smaller batch.
    """
    if gpu_is_available():
        device = Device(DeviceType.GPU)
    else:
        device = Device(DeviceType.CPU)

    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler = TensorflowBackendCompiler()
        compiler.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled = compiler.get_result()

        builder = COMPILER_TO_INFERENCE_LEARNER_MAP[ModelCompiler.XLA]()
        # NOTE(review): test_tf_lite passes ``source_dl_framework`` here;
        # confirm which keyword the builder actually expects.
        builder.to(device).execute(
            model=compiled,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            dl_framework=output_library,
        )
        learner = builder.get_result()
        assert isinstance(learner, TensorflowBackendInferenceLearner)

        # Save/load round trip.
        learner.save(tmp_dir)
        reloaded = load_model(tmp_dir)
        assert isinstance(reloaded, TensorflowBackendInferenceLearner)

        assert isinstance(learner.get_size(), int)

        example = list(learner.get_inputs_example())
        out = learner.predict(*example)
        assert out is not None

        # Validity of the optimized model.
        assert check_model_validity(
            learner,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Check also with a smaller batch size.
        half_batch = [tensor[: len(tensor) // 2] for tensor in example]
        out = learner.predict(*half_batch)
        assert out is not None
def run_test_torch_dynamo(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the toy model with TorchDynamo and validate the learner.

    Shared body for the TorchDynamo tests: compiles with
    ``TorchDynamoCompiler``, builds the learner registered for
    ``ModelCompiler.TORCH_DYNAMO``, checks its output against the validity
    metric and, for dynamic models, against the original model on half a
    batch. The save/load round trip is not exercised here.
    """
    # The temporary directory is currently unused (save/load is disabled)
    # but is still created and cleaned up as before.
    with TemporaryDirectory():
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler = TorchDynamoCompiler()
        compiler.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
            model_params=model_params,
        )
        compiled = compiler.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TORCH_DYNAMO
        ]()
        learner_builder.to(device).execute(
            model=compiled,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        assert isinstance(optimized_model, TorchDynamoInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        sample_inputs = list(optimized_model.get_inputs_example())
        assert optimized_model(*sample_inputs) is not None

        # The optimized model must stay within the allowed metric drop.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if dynamic:
            # A dynamic model must also accept a smaller batch size.
            torch_device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )
            half_batch = [
                sample[: len(sample) // 2].to(torch_device)
                for sample in sample_inputs
            ]
            optimized_out = optimized_model(*half_batch)
            assert optimized_out is not None

            reference_out = tuple(model(*half_batch))
            for got, expected in zip(optimized_out, reference_out):
                assert torch.allclose(got.float(), expected, rtol=2e-01)
def run_test_torchscript(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the toy model with TorchScript and validate the learner.

    Shared body for the TorchScript tests: compiles with
    ``TorchScriptCompiler``, builds the learner registered for
    ``ModelCompiler.TORCHSCRIPT``, round-trips it through
    ``save``/``load_model``, checks the validity metric and, for dynamic
    models, compares against the original model on half a batch.
    """
    with TemporaryDirectory() as workdir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler = TorchScriptCompiler()
        compiler.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled = compiler.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TORCHSCRIPT
        ]()
        learner_builder.to(device).execute(
            model=compiled,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        assert isinstance(optimized_model, TorchScriptInferenceLearner)

        # Save / load round trip.
        optimized_model.save(workdir)
        reloaded = load_model(workdir)
        assert isinstance(reloaded, TorchScriptInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        sample_inputs = list(optimized_model.get_inputs_example())
        optimized_out = optimized_model(*sample_inputs)
        assert optimized_out is not None

        # The reloaded learner must reproduce the in-memory learner's output.
        reloaded_out = reloaded(*sample_inputs)
        for got, expected in zip(optimized_out, reloaded_out):
            assert torch.allclose(got, expected)

        # The optimized model must stay within the allowed metric drop.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if dynamic:
            # A dynamic model must also accept a smaller batch size.
            torch_device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )
            half_batch = [
                sample[: len(sample) // 2].to(torch_device)
                for sample in sample_inputs
            ]
            optimized_out = optimized_model(*half_batch)
            assert optimized_out is not None

            reference_out = tuple(model(*half_batch))
            for got, expected in zip(optimized_out, reference_out):
                assert torch.allclose(got.float(), expected, rtol=2e-01)
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            int8_mode,
            2,
            "numeric_precision",
        )
        for int8_mode in (QuantizationType.DYNAMIC, QuantizationType.STATIC)
    ],
)
@pytest.mark.skipif(
    torch.cuda.is_available(),
    reason="INT8 quantization is not available on GPU",
)
def test_torchscript_int8_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the TorchScript flow with dynamic and static INT8 quantization.

    Skipped whenever CUDA is available; the actual checks live in
    ``run_test_torchscript``.
    """
    run_test_torchscript(
        output_library=output_library,
        dynamic=dynamic,
        quantization_type=quantization_type,
        metric_drop_ths=metric_drop_ths,
        metric=metric,
    )
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        # The QuantizationType.STATIC case is currently disabled.
    ],
)
@pytest.mark.skipif(
    not tvm_is_available(), reason="Apache TVM is not installed"
)
def test_tvm_onnx(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Convert the toy torch model, compile it with TVM and check the learner.

    The model is first run through ``PytorchConverter``; the first ``Path``
    among the converter results is handed to ``ONNXApacheTVMCompiler``. The
    learner registered for ``ModelCompiler.APACHE_TVM_ONNX`` is then
    round-tripped through ``save``/``load_model`` and, for dynamic models,
    compared against the original model on half a batch.
    """
    with TemporaryDirectory() as workdir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        # Conversion step: results are written under <workdir>/fp32.
        conversion_dir = Path(workdir) / "fp32"
        conversion_dir.mkdir(parents=True)
        converter = PytorchConverter()
        converter.to(device).set_state(model, input_data).execute(
            conversion_dir, model_params
        )
        converted_models = converter.get_result()
        assert len(converted_models) > 1

        # Keep only the file-based result and feed it to the compiler.
        converted_paths = [
            candidate
            for candidate in converted_models
            if isinstance(candidate, Path)
        ]
        model_path = str(converted_paths[0])

        compiler = ONNXApacheTVMCompiler()
        compiler.to(device).execute(
            model=model_path,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled = compiler.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.APACHE_TVM_ONNX
        ]()
        learner_builder.to(device).execute(
            model=compiled,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)

        # Save / load round trip.
        optimized_model.save(workdir)
        reloaded = load_model(workdir)
        assert isinstance(reloaded, PytorchApacheTVMInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        sample_inputs = optimized_model.get_inputs_example()
        optimized_out = optimized_model(*sample_inputs)
        assert optimized_out is not None

        # The reloaded learner must reproduce the in-memory learner's output.
        reloaded_out = reloaded(*sample_inputs)
        for got, expected in zip(optimized_out, reloaded_out):
            assert torch.allclose(got, expected)

        if dynamic:
            # A dynamic model must also accept a smaller batch size.
            half_batch = [
                sample[: len(sample) // 2] for sample in sample_inputs
            ]
            optimized_out = optimized_model(*half_batch)
            assert optimized_out is not None

            reference_out = tuple(model(*half_batch))
            for got, expected in zip(optimized_out, reference_out):
                assert torch.allclose(got.float(), expected, rtol=1e-01)
tvm_is_available(), reason="Can't test tvm if it's not installed." ) def test_tvm_torch( output_library: DeepLearningFramework, dynamic: bool, quantization_type: QuantizationType, metric_drop_ths: int, metric: str, ): with TemporaryDirectory() as tmp_dir: ( model, input_data, model_params, input_tfms, model_outputs, metric, ) = initialize_model(dynamic, metric, output_library, device) compiler_op = PyTorchApacheTVMCompiler() compiler_op.to(device).execute( model=model, model_params=model_params, input_tfms=input_tfms, metric_drop_ths=metric_drop_ths, quantization_type=quantization_type, input_data=input_data, ) compiled_model = compiler_op.get_result() build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[ ModelCompiler.APACHE_TVM_TORCH ]() build_inference_learner_op.to(device).execute( model=compiled_model, model_orig=compiler_op.model_orig if hasattr(compiler_op, "model_orig") else None, model_params=model_params, input_tfms=input_tfms, source_dl_framework=output_library, ) optimized_model = build_inference_learner_op.get_result() assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner) # Test save and load functions optimized_model.save(tmp_dir) loaded_model = PytorchApacheTVMInferenceLearner.load(tmp_dir) assert isinstance(loaded_model, PytorchApacheTVMInferenceLearner) assert isinstance(optimized_model.get_size(), int) inputs_example = optimized_model.get_inputs_example() res = optimized_model(*inputs_example) assert res is not None res_loaded = loaded_model(*inputs_example) assert all( [ torch.allclose(res_tensor, res_loaded_tensor) for (res_tensor, res_loaded_tensor) in zip(res, res_loaded) ] ) # Test validity of the model valid = check_model_validity( optimized_model, input_data, model_outputs, metric_drop_ths, quantization_type, metric, ) assert valid if dynamic: inputs_example = [ input_[: len(input_) // 2] for input_ in inputs_example ] res = optimized_model(*inputs_example) assert res is not None res_orig = 
class TestModel(torch.nn.Module):
    """Small two-input convolutional network used as a test fixture.

    Both inputs go through the same conv/ReLU stack, the resulting feature
    maps are summed, averaged over the last two dimensions and projected to
    two output values by a linear layer.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in this exact order so parameter names and
        # random initialisation match across runs with the same seed.
        self.conv1 = torch.nn.Conv2d(
            in_channels=3, out_channels=64, kernel_size=3
        )
        self.relu1 = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv2d(
            in_channels=64, out_channels=32, kernel_size=3
        )
        self.relu2 = torch.nn.ReLU()
        self.fcn = torch.nn.Linear(32, 2)

    def forward(self, input_tensor_0, input_tensor_1):
        """Return the 2-value projection of the summed features."""
        hidden_0 = self.relu1(self.conv1(input_tensor_0))
        features_0 = self.relu2(self.conv2(hidden_0))

        hidden_1 = self.relu1(self.conv1(input_tensor_1))
        features_1 = self.relu2(self.conv2(hidden_1))

        summed = features_0 + features_1
        # Average over the last two dimensions, then flatten to (-1, 32).
        pooled = summed.mean(dim=(-2, -1)).view(-1, 32)
        return self.fcn(pooled)
def _build_dynamic_model(
    framework: DeepLearningFramework,
) -> Tuple[torch.nn.Module, ModelParams]:
    """Build the toy model plus ``ModelParams`` describing a dynamic batch.

    Both inputs declare axis 0 as a dynamic ``"batch"`` axis (min 1, opt 1,
    max 2); the single output is dynamic on axis 0 as well.

    Raises:
        NotImplementedError: if ``framework`` is neither PyTorch nor
            TensorFlow.
    """
    num_inputs = 2

    # Each input gets its own dict instance, as in a hand-written literal.
    input_infos = [
        {"size": (DYNAMIC_BATCH_SIZE, *INPUT_SHAPE), "dtype": "float32"}
        for _ in range(num_inputs)
    ]
    dynamic_inputs = [
        {
            0: {
                "name": "batch",
                "min_val": 1,
                "opt_val": 1,
                "max_val": 2,
            }
        }
        for _ in range(num_inputs)
    ]

    if framework == DeepLearningFramework.PYTORCH:
        model = TestModel()
    elif framework == DeepLearningFramework.TENSORFLOW:
        model = tensorflow_model()
    else:
        raise NotImplementedError()

    params = ModelParams(
        batch_size=DYNAMIC_BATCH_SIZE,
        input_infos=input_infos,
        output_sizes=[(DYNAMIC_BATCH_SIZE, *OUTPUT_SHAPE)],
        output_types=[DataType.FLOAT32],
        dynamic_info={
            "inputs": dynamic_inputs,
            "outputs": [{0: "batch"}],
        },
    )
    return model, params
def get_huggingface_model(temp_dir: str, dl_framework: DeepLearningFramework):
    """Prepare an ALBERT model, its sample data and an ONNX export for tests.

    Loads ``albert-base-v1``, converts it with ``convert_hf_model``,
    benchmarks the converted model on the test split and exports it to
    ``<temp_dir>/test_model.onnx``.

    Returns:
        A tuple ``(model_path, model_params, output_structure, input_names,
        output_type, input_data, model_outputs)``.
    """
    checkpoint = "albert-base-v1"
    tokenizer = AlbertTokenizer.from_pretrained(checkpoint)
    hf_model = AlbertModel.from_pretrained(checkpoint)

    sample_text = "Short text you wish to process"
    encoded_input = tokenizer(sample_text, return_tensors="pt")

    run_device = Device(
        DeviceType.GPU if gpu_is_available() else DeviceType.CPU
    )
    (
        model,
        raw_input_data,
        input_names,
        output_structure,
        output_type,
    ) = convert_hf_model(hf_model, [encoded_input], device=run_device)

    input_data = DataManager(raw_input_data)
    input_data.split(TRAIN_TEST_SPLIT_RATIO)

    # Benchmark the original model on the test split; the first element of
    # the benchmark result is returned to the caller as ``model_outputs``.
    benchmark_op = LatencyOriginalModelMeasure()
    benchmark_op.to(run_device).execute(
        model=model,
        input_data=input_data.get_split("test"),
        dl_framework=dl_framework,
    )
    model_outputs = benchmark_op.get_result()[0]

    model_path = os.path.join(temp_dir, "test_model.onnx")
    model_params = extract_info_from_data(
        model, input_data, dl_framework, None, run_device
    )

    # NOTE(review): the export receives a bare ``DeviceType`` while every
    # other call in this function receives a ``Device`` instance; confirm
    # which one ``convert_torch_to_onnx`` expects.
    export_device = DeviceType.GPU if gpu_is_available() else DeviceType.CPU
    convert_torch_to_onnx(
        model, input_data, model_params, Path(model_path), export_device
    )

    return (
        model_path,
        model_params,
        output_structure,
        input_names,
        output_type,
        input_data,
        model_outputs,
    )
def map_compilers_and_compressors(ignore_list: List, enum_class: Callable):
    """Convert every entry of ``ignore_list`` through ``enum_class``.

    Args:
        ignore_list: Values to convert, or ``None``.
        enum_class: Callable applied to each entry (typically an enum class).

    Returns:
        A new list with the converted entries; an empty list when
        ``ignore_list`` is ``None``.
    """
    if ignore_list is None:
        return []
    return list(map(enum_class, ignore_list))
# Optional dependency shim for Intel Neural Compressor.
#
# Every name re-exported by this module is always bound: to the real object
# when the import works, to a placeholder otherwise.

# Pre-bind the module name so it exists even when the import below fails.
# A successful ``import neural_compressor`` rebinds it to the real package.
neural_compressor = DummyClass

try:
    import neural_compressor  # noqa F401,F811
    from neural_compressor.adaptor.pytorch import (
        _cfg_to_qconfig as cfg_to_qconfig,
        _cfgs_to_fx_cfgs as cfgs_to_fx_cfgs,
    )
    from neural_compressor.experimental import (
        MixedPrecision,
        Quantization,
        Pruning,
    )
except (ImportError, ValueError):
    # ImportError: the package (or one of its submodules) is missing.
    # ValueError: raised instead on MacOS.
    cfg_to_qconfig = cfgs_to_fx_cfgs = None
    MixedPrecision = Quantization = Pruning = DummyClass
# Optional dependency shim for onnxruntime and its quantization helpers.
try:
    import onnxruntime  # noqa F401
    from onnxruntime.quantization import (
        QuantType,
        quantize_static,
        quantize_dynamic,
        CalibrationDataReader,
    )
except ImportError:
    onnxruntime = DummyClass
    # NOTE(review): ``onnxruntime`` is ``DummyClass`` itself here, so this
    # attribute becomes visible on every other name bound to ``DummyClass``.
    onnxruntime.SessionOptions = None
    QuantType = None
    quantize_static = None
    quantize_dynamic = None
    CalibrationDataReader = DummyClass
except FileNotFoundError:
    # Solves a colab issue
    QuantType = None
    quantize_static = None
    quantize_dynamic = None
    CalibrationDataReader = DummyClass

try:
    # They require torch
    from onnxruntime.transformers import optimizer
    from onnxruntime.transformers.optimizer import MODEL_TYPES
except ImportError:
    optimizer = DummyClass
    MODEL_TYPES = DummyClass
class Tensorflow:
    """Placeholder bound to ``tensorflow`` when the real import fails.

    It only provides the attributes needed at import time: type names
    resolve to ``DummyClass`` and ``function(...)`` is a no-op decorator
    factory.
    """

    # Type placeholders.
    Module = DummyClass
    Tensor = DummyClass

    # Sub-namespaces, backed by the sibling placeholder classes.
    keras = Keras()
    data = data
    dtypes = dtypes

    # Dtype placeholders.
    float16 = DummyClass
    float32 = DummyClass
    int32 = DummyClass
    int64 = DummyClass

    @staticmethod
    def function(**kwargs):
        """Return a decorator that hands back its argument unchanged."""

        def _passthrough(x):
            return x

        return _passthrough
# Optional-dependency guard for PyTorch.
# When torch is importable the real objects are bound; otherwise every name
# exported by this module is bound to a placeholder so that
# ``from nebullvm.optional_modules.torch import ...`` never fails.
try:
    import torch  # noqa F401
    from torch.nn import Module  # noqa F401
    from torch.jit import ScriptModule  # noqa F401
    from torch.fx import GraphModule
    from torch.utils.data import DataLoader, Dataset  # noqa F401
    from torch.quantization.quantize_fx import (  # noqa F401
        prepare_fx,
        convert_fx,
    )
    from torch.ao.quantization.stubs import QuantStub, DeQuantStub
    from torch.fx import symbolic_trace
    from torch.quantization import default_dynamic_qconfig
    import torch.distributed as torch_distributed
except ImportError:

    # Stand-ins for the sub-namespaces that are reachable through the
    # placeholder ``torch`` class below (torch.nn, torch.jit, torch.fx).
    class nn:
        Module = DummyClass

    class jit:
        ScriptModule = DummyClass

    class fx:
        GraphModule = DummyClass

    # Placeholder for the ``torch`` module itself: it only carries the
    # attributes listed here, all bound to DummyClass or to the stubs above.
    class torch:
        float = half = int8 = DummyClass
        float16 = float32 = int32 = int64 = DummyClass
        Tensor = DummyClass
        dtype = DummyClass
        nn = nn
        jit = jit
        Generator = DummyClass
        FloatTensor = DummyClass
        fx = fx

        # Both helpers return a one-argument callable that ignores its
        # argument and returns None. Used as ``@torch.no_grad()`` the stub
        # therefore replaces the decorated function with None.
        @staticmethod
        def no_grad():
            return lambda x: None

        @staticmethod
        def inference_mode():
            return lambda x: None

    # Module-level fallbacks: class-like names become DummyClass, callables
    # and the distributed module become None.
    Dataset = DummyClass
    Module = DummyClass
    ScriptModule = DummyClass
    GraphModule = DummyClass
    DataLoader = DummyClass
    symbolic_trace = None
    QuantStub = DeQuantStub = DummyClass
    default_dynamic_qconfig = prepare_fx = convert_fx = None
    Generator = DummyClass
    FloatTensor = DummyClass
    torch_distributed = None
# Optional-dependency guard for Apache TVM: bind the real symbols when TVM is
# installed, otherwise bind every exported name to DummyClass.
try:
    import tvm
    from tvm import IRModule
    from tvm.runtime.ndarray import NDArray
    from tvm.autotvm.tuner import XGBTuner
    from tvm import autotvm
    from tvm import relay
    from tvm.relay.transform import ToMixedPrecision
    from tvm.contrib.graph_executor import GraphModule
    from tvm.runtime import Module
    from tvm.relay.backend.executor_factory import ExecutorFactoryModule
except ImportError:
    # One explicit binding per exported name; all share the same placeholder.
    tvm = DummyClass
    IRModule = DummyClass
    NDArray = DummyClass
    XGBTuner = DummyClass
    ExecutorFactoryModule = DummyClass
    autotvm = DummyClass
    relay = DummyClass
    ToMixedPrecision = DummyClass
    GraphModule = DummyClass
    Module = DummyClass
def torch_is_available() -> bool:
    """Tell whether a usable PyTorch installation can be imported.

    Returns:
        bool: False when torch cannot be imported or when
            ``check_module_version`` rejects it for ``min_version="1.10.0"``;
            True otherwise. A missing CUDA build on a machine where
            ``gpu_is_available()`` is true only triggers a warning.
    """
    no_cuda_message = (
        "Installed PyTorch does not have cuda support. "
        "Please ensure that torch.cuda.is_available() "
        "returns True by installing the proper version "
        "of PyTorch. "
    )
    old_version_message = (
        "torch module version must be >= 1.10.0. "
        "Please update it if you want to use it."
    )
    try:
        import torch  # noqa F401

        # torch is queried first; the machine-level GPU probe only runs when
        # torch itself reports no CUDA support.
        if not torch.cuda.is_available() and gpu_is_available():
            logger.warning(no_cuda_message)

        version_ok = check_module_version(torch, min_version="1.10.0")
        if not version_ok:
            logger.warning(old_version_message)
            return False
        return True
    except ImportError:
        return False
def check_dependencies(device: Device):
    """Warn about frameworks, compilers and helper packages that are missing.

    The function probes the ``*_is_available`` helpers relevant for the
    target device and emits one ``logger.warning`` per non-empty category
    (frameworks, suggested compilers, dependencies). It returns nothing.

    Args:
        device (Device): Target device; only ``device.type`` is inspected.
    """
    missing_frameworks = []
    missing_suggested_compilers = []
    # NOTE(review): optional compilers are collected but never reported in a
    # warning below; this mirrors the previous behaviour - confirm intent.
    missing_optional_compilers = []
    missing_dependencies = []

    # Some platforms do not expose "brand_raw"; fall back to an empty string
    # so the Intel-specific hints are simply skipped instead of raising.
    processor = cpuinfo.get_cpu_info().get("brand_raw", "").lower()
    is_intel = "intel" in processor

    if device.type is DeviceType.TPU:
        if not torch_is_available():
            missing_frameworks.append("torch")
        if not torch_xla_is_available():
            missing_dependencies.append("torch_xla")
    elif device.type is DeviceType.NEURON:
        if not torch_is_available():
            missing_frameworks.append("torch")
        if not torch_neuron_is_available():
            missing_dependencies.append("torch_neuron")
    else:
        if not onnx_is_available():
            missing_frameworks.append("onnx")
        if not tvm_is_available():
            missing_optional_compilers.append("tvm")
        if not onnxruntime_is_available():
            missing_suggested_compilers.append("onnxruntime")
        elif not _onnxmltools_is_available():
            missing_dependencies.append("onnxmltools")
        if not faster_transformer_is_available():
            missing_optional_compilers.append("faster_transformer")

        if device.type is DeviceType.GPU:
            if not tensorrt_is_available():
                missing_suggested_compilers.append("tensorrt")
            elif not _onnxsim_is_available():
                # TensorRT helpers are only checked when TensorRT exists.
                missing_dependencies.append("onnxsim")
            elif not _polygraphy_is_available():
                missing_dependencies.append("polygraphy")

        if device.type is DeviceType.CPU:
            if not openvino_is_available() and is_intel:
                missing_suggested_compilers.append("openvino")

        if torch_is_available():
            if (
                not tvm_is_available()
                and "tvm" not in missing_optional_compilers
            ):
                missing_optional_compilers.append("tvm")
            if not bladedisc_is_available():
                missing_optional_compilers.append("torch_blade")
            if device.type is DeviceType.CPU:
                if not deepsparse_is_available() and is_intel:
                    missing_suggested_compilers.append("deepsparse")
                if not intel_neural_compressor_is_available() and is_intel:
                    missing_suggested_compilers.append("neural_compressor")
            elif device.type is DeviceType.GPU:
                # The helper must be *called*: the bare function object is
                # always truthy, which previously hid a missing
                # torch_tensorrt installation.
                if not torch_tensorrt_is_available():
                    missing_suggested_compilers.append("torch_tensorrt")
        else:
            missing_frameworks.append("torch")

        if tensorflow_is_available():
            if not tf2onnx_is_available():
                missing_dependencies.append("tf2onnx")
        else:
            missing_frameworks.append("tensorflow")

    if missing_frameworks:
        logger.warning(
            f"Missing Frameworks: {', '.join(missing_frameworks)}.\n "
            f"Please install them "
            "to include them in the optimization pipeline."
        )

    if missing_suggested_compilers:
        logger.warning(
            f"Missing Compilers: {', '.join(missing_suggested_compilers)}.\n "
            f"Please install them "
            "to include them in the optimization pipeline."
        )

    if missing_dependencies:
        logger.warning(
            f"Missing Dependencies: {', '.join(missing_dependencies)}.\n "
            f"Without them, some compilers "
            f"may not work properly."
        )
def adapt_inference_learner(
    self, optimized_model: OptimizedModel
) -> OptimizedModel:
    """Re-insert the optimized UNet into a copy of the pipeline and refresh
    the metrics stored on ``optimized_model``.

    The latency, throughput, inference learner and size of
    ``optimized_model`` are updated in place and the same object is
    returned.
    """
    pipeline = copy.deepcopy(self.original_pipeline)
    pipeline.to(self.device.to_torch_format())

    if self.device.type is DeviceType.GPU:
        # Best effort: any failure while enabling xformers attention is
        # deliberately ignored.
        try:
            pipeline.enable_xformers_memory_efficient_attention()
        except Exception:
            pass

    pipeline = postprocess_diffusers(
        optimized_model.inference_learner,
        pipeline,
        self.device,
    )

    logger.info("Benchmarking optimized pipeline...")
    optimized_model.latency_seconds = self.__benchmark_pipeline(pipeline)
    optimized_model.throughput = get_throughput(
        optimized_model.latency_seconds
    )
    optimized_model.inference_learner = DiffusionInferenceLearner(pipeline)

    # Add the size of every torch module held by the pipeline except the
    # UNet, which is excluded by name.
    extra_size = sum(
        get_torch_model_size(component)
        for (attr_name, component) in pipeline.__dict__.items()
        if attr_name != "unet" and isinstance(component, torch.nn.Module)
    )
    optimized_model.size_mb += extra_size / 1e6

    return optimized_model
class HuggingFaceAdapter(ModelAdapter):
    """Adapter that lazily converts a HuggingFace model and its data through
    ``convert_hf_model`` and wraps the optimized learner accordingly."""

    def __init__(self, model: Any, data: List, device: Device, **kwargs):
        self.original_model = model
        self.original_data = data
        self.device = device
        # Extra keyword arguments are forwarded as-is to convert_hf_model.
        self.tokenizer_params = kwargs

        # Conversion results; populated on first access by __adapt_model.
        self.__hf_model = None
        self.__hf_data = None
        self.__hf_input_names = None
        self.__hf_output_type = None
        self.__hf_output_structure = None
        self.__adapted = False

    def __adapt_model(self):
        # Only the first sample is inspected to validate the data format.
        first_sample = self.original_data[0]
        if not is_huggingface_data(first_sample):
            raise ValueError("Cannot convert non-HuggingFace data")

        (
            self.__hf_model,
            self.__hf_data,
            self.__hf_input_names,
            self.__hf_output_structure,
            self.__hf_output_type,
        ) = convert_hf_model(
            self.original_model,
            self.original_data,
            self.device,
            **self.tokenizer_params,
        )
        self.__adapted = True

    @property
    def adapted_model(self):
        """Converted model; the conversion runs on first access."""
        if not self.__adapted:
            self.__adapt_model()
        return self.__hf_model

    @property
    def adapted_data(self):
        """Converted data; the conversion runs on first access."""
        if not self.__adapted:
            self.__adapt_model()
        return self.__hf_data

    def adapt_inference_learner(
        self, optimized_model: OptimizedModel
    ) -> OptimizedModel:
        """Wrap the optimized learner in a HuggingFaceInferenceLearner,
        updating ``optimized_model`` in place and returning it."""
        from nebullvm.operations.inference_learners.huggingface import (
            HuggingFaceInferenceLearner,
        )

        core_learner = optimized_model.inference_learner
        optimized_model.inference_learner = HuggingFaceInferenceLearner(
            core_inference_learner=core_learner,
            output_structure=self.__hf_output_structure,
            input_names=self.__hf_input_names,
            output_type=self.__hf_output_type,
        )
        return optimized_model

    def adapt_original_model(
        self, original_model: OriginalModel
    ) -> OriginalModel:
        """Return the original model untouched."""
        return original_model
class PytorchBenchmark(BaseBenchmark):
    """Benchmark for models called with PyTorch tensors."""

    def benchmark(self):
        """Run warm-up and timed iterations and report the averages.

        Every input tensor is moved to the target device first. When the
        model is a ``torch.nn.Module`` it is moved to the device and put in
        eval mode in place. The averages are also printed.

        Returns:
            Tuple: ``(throughput, latency)`` where throughput is
                ``batch_size / mean_time`` and latency is
                ``mean_time / batch_size``.
        """
        torch_device = self.device.to_torch_format()
        input_tensors = [
            [tensor.to(torch_device) for tensor in tensors]
            for tensors in self.input_tensors
        ]
        # The batch size is read from the first dimension of the first input.
        batch_size = input_tensors[0][0].shape[0]
        on_gpu = self.device.type is DeviceType.GPU

        if isinstance(self.model, torch.nn.Module):
            self.model.to(torch_device).eval()

        with torch.no_grad():
            for i in tqdm(
                range(self.n_warmup),
                desc=f"Performing warm up on {self.n_warmup} iterations",
            ):
                self.model(
                    *input_tensors[i % min(self.n_warmup, len(input_tensors))]
                )
        if on_gpu:
            # Make sure queued warm-up kernels do not leak into the timings.
            torch.cuda.synchronize()

        timings = []
        with torch.no_grad():
            for i in tqdm(
                range(1, self.n_runs + 1),
                desc=f"Performing benchmark on {self.n_runs} iterations",
            ):
                # perf_counter is monotonic and high resolution, unlike the
                # wall clock, which can jump when the system time changes.
                start_time = time.perf_counter()
                self.model(
                    *input_tensors[i % min(self.n_runs, len(input_tensors))]
                )
                if on_gpu:
                    # Wait for the GPU so the elapsed time covers the run.
                    torch.cuda.synchronize()
                timings.append(time.perf_counter() - start_time)

        mean_time = np.mean(timings)
        print(f"Batch size: {batch_size}")
        throughput = batch_size / mean_time
        latency = mean_time / batch_size
        print("Average Throughput: %.2f data/second" % throughput)
        print("Average Latency: %.4f seconds/data" % latency)
        return throughput, latency
def benchmark(
    model, input_data, device=None, random=False, n_warmup=50, n_runs=1000
):
    """Performs a Benchmark on the input model regardless of the framework it
    was used for implementing it.

    Args:
        model (Any): The input model.
        input_data (Iterable or Sequence): Input data to be used for
            optimizing the model. PyTorch, TensorFlow
            and Onnx respectively accept input tensor in `torch.Tensor`,
            `tf.Tensor` and `np.ndarray` formats. Note that each input
            sample must be a tuple containing a tuple as first element, the
            `inputs`, and the `label` as second element. The `inputs` needs to
            be passed as tuple even if a single input is needed by the model
            (in this case the `inputs` tuple will contain just an element).
            HuggingFace models can take as data samples both dictionaries or
            strings. Strings will then be converted in data samples using the
            HuggingFace tokenizer which must be given as input when just a
            list of string is provided as input_data (tokenizers can be passed
            as extra arguments of this function using the keyword `tokenizer`).
        device (str): Device to be used for running the benchmark. If None,
            CPU will be used. Default: None. Ignored when `model` is a
            `BaseInferenceLearner`: in that case `model.device` is used.
        random (bool, optional): If set to true, the data used to benchmark the
            model will be computed randomly given the info extracted from the
            provided input_data.
        n_warmup (int, optional): Number of warmup iterations.
        n_runs (int, optional): Number of iterations performed to benchmark
            the model.

    Returns:
        The value returned by the framework-specific benchmark class, i.e.
        the `(throughput, latency)` pair for the benchmarks defined in this
        module.

    Raises:
        ValueError: If `input_data` is a dataloader that cannot be converted
            into a `DataManager`, or if it does not match any supported
            format.
        TypeError: If the framework of `model` cannot be detected.
    """
    if not isinstance(model, BaseInferenceLearner):
        device = check_device(device)
    else:
        device = model.device

    logger.info(f"Running benchmark on {device.type.name}")

    dl_framework = _get_dl_framework(model)

    if isinstance(input_data, (DataLoader, tf.data.Dataset)):
        try:
            input_data = DataManager.from_dataloader(input_data)
        except Exception as err:
            # Keep the original failure attached as the cause for debugging.
            raise ValueError(
                "The provided dataloader does not match the expected "
                "format.\n"
                "Speedster supports dataloaders that return tuples in "
                "the\n"
                "following formats: \n"
                "Single input: (input, label)\n"
                "Multiple inputs: ((input1, input2, ...), label) or "
                "(input1, input2, ..., label)\n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            ) from err

    if not isinstance(input_data, DataManager):
        if check_input_data(input_data):
            if is_data_subscriptable(input_data):
                input_data = DataManager(input_data)
            else:
                input_data = DataManager.from_iterable(input_data)
        else:
            raise ValueError(
                "The provided data does not match the expected "
                "format.\n"
                "Speedster supports data in the following formats: \n"
                "- PyTorch DataLoader\n"
                "- TensorFlow Dataset\n"
                "- List of tuples: [((input_0, ... ), label), ...] \n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            )

    if random:
        # Only the metadata of the provided data is kept; the tensors fed to
        # the model are generated from it.
        model_params = extract_info_from_data(
            model, input_data, dl_framework, None, device
        )
        input_data = _create_model_inputs(dl_framework, model_params)
    else:
        input_data = input_data.get_list()

    # Hand the measurements back to the caller instead of discarding them.
    return BENCHMARK_FUNCTIONS[dl_framework](
        model=model,
        input_tensors=input_data,
        device=device,
        n_warmup=n_warmup,
        n_runs=n_runs,
    ).benchmark()
@classmethod
def from_dataloader(
    cls,
    dataloader: Union[DataLoader, tf.data.Dataset],
    max_length: int = 500,
):
    """Build a DataManager from the batches yielded by a dataloader.

    Batches are collected until ``i * batch_size`` reaches ``max_length``
    and each one is normalised to ``(inputs_tuple, label)``:

    - a bare tensor becomes ``((tensor,), None)``;
    - a one-element list/tuple is stored as ``(batch, None)``;
    - ``(inputs, label)`` is kept when ``inputs`` is a tuple or a list
      (PyTorch's default collate function yields lists for nested
      sequences); a list is converted to a tuple;
    - ``(tensor, label)`` becomes ``((tensor,), label)``;
    - longer sequences use the last element as the label.

    Args:
        dataloader: A PyTorch ``DataLoader`` or any other object exposing
            ``_batch_size`` (the attribute read for non-DataLoader inputs).
        max_length (int): Upper bound used both to validate the batch size
            and to stop collecting batches.

    Raises:
        ValueError: If the batch size exceeds ``max_length`` or a batch has
            an unsupported structure.
    """
    batch_size = (
        dataloader.batch_size
        if isinstance(dataloader, DataLoader)
        else dataloader._batch_size
    )

    if batch_size > max_length:
        # Report the value actually compared: non-DataLoader inputs are read
        # through ``_batch_size`` and need not expose ``batch_size``.
        raise ValueError(
            f"Batch size ({batch_size}) is greater than "
            f"max_length ({max_length})."
        )

    tensor_types = (torch.Tensor, tf.Tensor)
    samples = []
    warning_label = False

    for i, batch in enumerate(dataloader):
        if i * batch_size >= max_length:
            break

        if isinstance(batch, (list, tuple)):
            if len(batch) == 1:
                samples.append((batch, None))
            elif len(batch) == 2:
                inputs, label = batch
                if isinstance(inputs, (tuple, list)):
                    samples.append((tuple(inputs), label))
                elif isinstance(inputs, tensor_types):
                    warning_label = True
                    samples.append(((inputs,), label))
                else:
                    raise ValueError(
                        "The first element of the batch should be a "
                        "tuple or a torch.Tensor"
                    )
            else:
                warning_label = True
                samples.append((tuple(batch[:-1]), batch[-1]))
        elif isinstance(batch, tensor_types):
            samples.append(((batch,), None))
        else:
            raise ValueError(
                "The batch should be a tuple, a list or a Tensor"
            )

    if warning_label:
        logger.warning(
            "The provided dataloader returns a tuple of tensors "
            "for each batch. The last tensor in the tuple will "
            "be considered as the label. "
            "To avoid this warning, the dataloader should return "
            "a tuple for each batch, where the first element is "
            "a tuple containing the inputs and the second element "
            "is a tensor containing the label."
        )

    return cls(samples)
class PytorchDataset(Dataset):
    """Dataset view that exposes batched input data one sample at a time.

    The batch size is read once from the first dimension of the first input
    of the first batch and is then used to map a flat index to a
    ``(batch, position)`` pair.
    """

    def __init__(self, input_data: DataManager, has_labels: bool = False):
        self.data = input_data
        self.has_labels = has_labels
        first_inputs = input_data[0][0]
        self.batch_size = first_inputs[0].shape[0]

    def __len__(self):
        # Total number of samples: sum of the first-input batch dimensions.
        total = 0
        for batch_inputs, _ in self.data:
            total += batch_inputs[0].shape[0]
        return total

    def __getitem__(self, idx):
        batch_idx = int(idx / self.batch_size)
        item_idx = idx % self.batch_size
        batch = self.data[batch_idx]
        sample = tuple(tensor[item_idx] for tensor in batch[0])

        if not self.has_labels:
            return sample

        labels = batch[1]
        if labels is None:
            # Unlabelled batches get a constant dummy label.
            return sample, torch.tensor([0])
        return sample, labels[item_idx]
@torch.no_grad()
def get_unet_inputs(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 1,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
    """Build the arguments passed to the UNet for the first scheduler step.

    ``self`` must expose the attributes used below: ``unet``, ``scheduler``,
    ``vae_scale_factor``, ``_execution_device``, ``check_inputs``,
    ``_encode_prompt`` and ``prepare_latents``.
    NOTE(review): presumably a diffusers StableDiffusionPipeline - confirm.

    Returns:
        The tuple ``(latent_model_input, t, prompt_embeds,
        cross_attention_kwargs)`` computed for the first timestep, where
        ``cross_attention_kwargs`` is returned unchanged. If the scheduler
        yields no timesteps the loop body never runs and the function
        implicitly returns None.
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        # No usable prompt: the batch size comes from the embeddings.
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # Guidance is considered active only for scales strictly above 1.0.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    prompt_embeds = self._encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    for i, t in enumerate(timesteps):
        # expand the latents if we are doing classifier free guidance
        latent_model_input = (
            torch.cat([latents] * 2)
            if do_classifier_free_guidance
            else latents
        )
        latent_model_input = self.scheduler.scale_model_input(
            latent_model_input, t
        )

        # The function returns during the first iteration, so only the
        # first timestep is ever processed.
        return latent_model_input, t, prompt_embeds, cross_attention_kwargs
input_shape[2][0], "max_val": input_shape[2][0], } }, ], "outputs": [{0: "2B", 2: "H", 3: "W"}], } def preprocess_diffusers(pipe: DiffusionPipeline) -> torch.nn.Module: # Function that wraps the Diffusion UNet model to # be compatible with the optimizations performed by nebullvm model = DiffusionUNetWrapper(pipe.unet) return model def postprocess_diffusers( optimized_model: Any, pipe: StableDiffusionPipeline, device: Device, ) -> StableDiffusionPipeline: # Function that puts the optimized Diffusion UNet model back # into the Diffusion Pipeline final_model = OptimizedDiffusionWrapper(optimized_model) final_model.sample_size = pipe.unet.sample_size final_model.in_channels = pipe.unet.in_channels final_model.device = torch.device(device.to_torch_format()) final_model.config = pipe.unet.config final_model.in_channels = pipe.unet.in_channels pipe.unet = final_model return pipe class Optimizer: def __init__(self, onnx_graph, verbose=False): self.graph = gs.import_onnx(onnx_graph) self.verbose = verbose def info(self, prefix): if self.verbose: print( f"{prefix} .. 
{len(self.graph.nodes)} nodes, {len(self.graph.tensors().keys())} tensors, {len(self.graph.inputs)} inputs, {len(self.graph.outputs)} outputs" ) def cleanup(self, return_onnx=False): self.graph.cleanup().toposort() if return_onnx: return gs.export_onnx(self.graph) def select_outputs(self, keep, names=None): self.graph.outputs = [self.graph.outputs[o] for o in keep] if names: for i, name in enumerate(names): self.graph.outputs[i].name = name def fold_constants(self, return_onnx=False): onnx_graph = fold_constants( gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True, ) self.graph = gs.import_onnx(onnx_graph) if return_onnx: return onnx_graph def infer_shapes(self, return_onnx=False): onnx_graph = gs.export_onnx(self.graph) if onnx_graph.ByteSize() > 2147483648: raise TypeError("ERROR: model size exceeds supported 2GB limit") else: onnx_graph = onnx.shape_inference.infer_shapes(onnx_graph) self.graph = gs.import_onnx(onnx_graph) if return_onnx: return onnx_graph def get_path(version, inpaint=False): if version == "1.4": if inpaint: return "runwayml/stable-diffusion-inpainting" else: return "CompVis/stable-diffusion-v1-4" elif version == "1.5": if inpaint: return "runwayml/stable-diffusion-inpainting" else: return "runwayml/stable-diffusion-v1-5" elif version == "2.0-base": if inpaint: return "stabilityai/stable-diffusion-2-inpainting" else: return "stabilityai/stable-diffusion-2-base" elif version == "2.0": if inpaint: return "stabilityai/stable-diffusion-2-inpainting" else: return "stabilityai/stable-diffusion-2" elif version == "2.1": return "stabilityai/stable-diffusion-2-1" elif version == "2.1-base": return "stabilityai/stable-diffusion-2-1-base" else: raise ValueError(f"Incorrect version {version}") def get_embedding_dim(version): if version in ("1.4", "1.5"): return 768 elif version in ("2.0", "2.0-base", "2.1", "2.1-base"): return 1024 else: raise ValueError(f"Incorrect version {version}") class BaseModel: def __init__( self, hf_token, 
fp16=False, device="cuda", verbose=False, path="", max_batch_size=16, embedding_dim=768, text_maxlen=77, ): self.name = "SD Model" self.hf_token = hf_token self.fp16 = fp16 self.device = device self.verbose = verbose self.path = path self.min_batch = 1 self.max_batch = max_batch_size self.min_image_shape = 256 # min image resolution: 256x256 self.max_image_shape = 1024 # max image resolution: 1024x1024 self.min_latent_shape = self.min_image_shape // 8 self.max_latent_shape = self.max_image_shape // 8 self.embedding_dim = embedding_dim self.text_maxlen = text_maxlen def get_model(self): pass def get_input_names(self): pass def get_output_names(self): pass def get_dynamic_axes(self): return None def get_sample_input(self, batch_size, image_height, image_width): pass def get_input_profile( self, batch_size, image_height, image_width, static_batch, static_shape ): return None def get_shape_dict(self, batch_size, image_height, image_width): return None def optimize(self, onnx_graph): opt = Optimizer(onnx_graph, verbose=self.verbose) opt.info(self.name + ": original") opt.cleanup() opt.info(self.name + ": cleanup") opt.fold_constants() opt.info(self.name + ": fold constants") opt.infer_shapes() opt.info(self.name + ": shape inference") onnx_opt_graph = opt.cleanup(return_onnx=True) opt.info(self.name + ": finished") return onnx_opt_graph def check_dims(self, batch_size, image_height, image_width): assert batch_size >= self.min_batch and batch_size <= self.max_batch assert image_height % 8 == 0 or image_width % 8 == 0 latent_height = image_height // 8 latent_width = image_width // 8 assert ( latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape ) assert ( latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape ) return (latent_height, latent_width) def get_minmax_dims( self, batch_size, image_height, image_width, static_batch, static_shape ): min_batch = batch_size if static_batch else self.min_batch max_batch = 
batch_size if static_batch else self.max_batch latent_height = image_height // 8 latent_width = image_width // 8 min_image_height = ( image_height if static_shape else self.min_image_shape ) max_image_height = ( image_height if static_shape else self.max_image_shape ) min_image_width = image_width if static_shape else self.min_image_shape max_image_width = image_width if static_shape else self.max_image_shape min_latent_height = ( latent_height if static_shape else self.min_latent_shape ) max_latent_height = ( latent_height if static_shape else self.max_latent_shape ) min_latent_width = ( latent_width if static_shape else self.min_latent_shape ) max_latent_width = ( latent_width if static_shape else self.max_latent_shape ) return ( min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width, min_latent_height, max_latent_height, min_latent_width, max_latent_width, ) class CLIP(BaseModel): def __init__( self, hf_token, device, verbose, path, max_batch_size, embedding_dim ): super(CLIP, self).__init__( hf_token, device=device, verbose=verbose, path=path, max_batch_size=max_batch_size, embedding_dim=embedding_dim, ) self.name = "CLIP" def get_model(self): return CLIPTextModel.from_pretrained( self.path, subfolder="text_encoder", use_auth_token=self.hf_token ).to(self.device) def get_input_names(self): return ["input_ids"] def get_output_names(self): return ["text_embeddings", "pooler_output"] def get_dynamic_axes(self): return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}} def get_input_profile( self, batch_size, image_height, image_width, static_batch, static_shape ): self.check_dims(batch_size, image_height, image_width) min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims( batch_size, image_height, image_width, static_batch, static_shape ) return { "input_ids": [ (min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen), ] } def get_shape_dict(self, batch_size, image_height, 
image_width): self.check_dims(batch_size, image_height, image_width) return { "input_ids": (batch_size, self.text_maxlen), "text_embeddings": ( batch_size, self.text_maxlen, self.embedding_dim, ), } def get_sample_input(self, batch_size, image_height, image_width): self.check_dims(batch_size, image_height, image_width) return torch.zeros( batch_size, self.text_maxlen, dtype=torch.int32, device=self.device ) def optimize(self, onnx_graph): opt = Optimizer(onnx_graph, verbose=self.verbose) opt.info(self.name + ": original") opt.select_outputs([0]) # delete graph output#1 opt.cleanup() opt.info(self.name + ": remove output[1]") opt.fold_constants() opt.info(self.name + ": fold constants") opt.infer_shapes() opt.info(self.name + ": shape inference") opt.select_outputs( [0], names=["text_embeddings"] ) # rename network output opt.info(self.name + ": remove output[0]") opt_onnx_graph = opt.cleanup(return_onnx=True) opt.info(self.name + ": finished") return opt_onnx_graph def make_CLIP( version, hf_token, device, verbose, max_batch_size, inpaint=False ): return CLIP( hf_token=hf_token, device=device, verbose=verbose, path=get_path(version, inpaint=inpaint), max_batch_size=max_batch_size, embedding_dim=get_embedding_dim(version), ) class UNet(BaseModel): def __init__( self, hf_token, fp16=False, device="cuda", verbose=False, path="", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4, ): super(UNet, self).__init__( hf_token, fp16=fp16, device=device, verbose=verbose, path=path, max_batch_size=max_batch_size, embedding_dim=embedding_dim, text_maxlen=text_maxlen, ) self.unet_dim = unet_dim self.name = "UNet" def get_model(self): model_opts = ( {"revision": "fp16", "torch_dtype": torch.float16} if self.fp16 else {} ) return UNet2DConditionModel.from_pretrained( self.path, subfolder="unet", use_auth_token=self.hf_token, **model_opts, ).to(self.device) def get_input_names(self): return ["sample", "timestep", "encoder_hidden_states"] def get_output_names(self): 
return ["latent"] def get_dynamic_axes(self): return { "sample": {0: "2B", 2: "H", 3: "W"}, "encoder_hidden_states": {0: "2B"}, "latent": {0: "2B", 2: "H", 3: "W"}, } def get_input_profile( self, batch_size, image_height, image_width, static_batch, static_shape ): latent_height, latent_width = self.check_dims( batch_size, image_height, image_width ) ( min_batch, max_batch, _, _, _, _, min_latent_height, max_latent_height, min_latent_width, max_latent_width, ) = self.get_minmax_dims( batch_size, image_height, image_width, static_batch, static_shape ) return { "sample": [ ( 2 * min_batch, self.unet_dim, min_latent_height, min_latent_width, ), (2 * batch_size, self.unet_dim, latent_height, latent_width), ( 2 * max_batch, self.unet_dim, max_latent_height, max_latent_width, ), ], "encoder_hidden_states": [ (2 * min_batch, self.text_maxlen, self.embedding_dim), (2 * batch_size, self.text_maxlen, self.embedding_dim), (2 * max_batch, self.text_maxlen, self.embedding_dim), ], } def get_shape_dict(self, batch_size, image_height, image_width): latent_height, latent_width = self.check_dims( batch_size, image_height, image_width ) return { "sample": ( 2 * batch_size, self.unet_dim, latent_height, latent_width, ), "encoder_hidden_states": ( 2 * batch_size, self.text_maxlen, self.embedding_dim, ), "latent": (2 * batch_size, 4, latent_height, latent_width), } def get_sample_input(self, batch_size, image_height, image_width): latent_height, latent_width = self.check_dims( batch_size, image_height, image_width ) dtype = torch.float16 if self.fp16 else torch.float32 return ( torch.randn( 2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device, ), torch.tensor([1.0], dtype=torch.float32, device=self.device), torch.randn( 2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device, ), ) def make_UNet( version, hf_token, device, verbose, max_batch_size, inpaint=False ): return UNet( hf_token=hf_token, fp16=True, 
device=device, verbose=verbose, path=get_path(version, inpaint=inpaint), max_batch_size=max_batch_size, embedding_dim=get_embedding_dim(version), unet_dim=(9 if inpaint else 4), ) class VAE(BaseModel): def __init__( self, hf_token, device, verbose, path, max_batch_size, embedding_dim ): super(VAE, self).__init__( hf_token, device=device, verbose=verbose, path=path, max_batch_size=max_batch_size, embedding_dim=embedding_dim, ) self.name = "VAE decoder" def get_model(self): vae = AutoencoderKL.from_pretrained( self.path, subfolder="vae", use_auth_token=self.hf_token ).to(self.device) vae.forward = vae.decode return vae def get_input_names(self): return ["latent"] def get_output_names(self): return ["images"] def get_dynamic_axes(self): return { "latent": {0: "B", 2: "H", 3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}, } def get_input_profile( self, batch_size, image_height, image_width, static_batch, static_shape ): latent_height, latent_width = self.check_dims( batch_size, image_height, image_width ) ( min_batch, max_batch, _, _, _, _, min_latent_height, max_latent_height, min_latent_width, max_latent_width, ) = self.get_minmax_dims( batch_size, image_height, image_width, static_batch, static_shape ) return { "latent": [ (min_batch, 4, min_latent_height, min_latent_width), (batch_size, 4, latent_height, latent_width), (max_batch, 4, max_latent_height, max_latent_width), ] } def get_shape_dict(self, batch_size, image_height, image_width): latent_height, latent_width = self.check_dims( batch_size, image_height, image_width ) return { "latent": (batch_size, 4, latent_height, latent_width), "images": (batch_size, 3, image_height, image_width), } def get_sample_input(self, batch_size, image_height, image_width): latent_height, latent_width = self.check_dims( batch_size, image_height, image_width ) return torch.randn( batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device, ) def make_VAE( version, hf_token, device, verbose, max_batch_size, 
inpaint=False ): return VAE( hf_token=hf_token, device=device, verbose=verbose, path=get_path(version, inpaint=inpaint), max_batch_size=max_batch_size, embedding_dim=get_embedding_dim(version), ) class TorchVAEEncoder(torch.nn.Module): def __init__(self, token, device, path): super().__init__() self.path = path self.vae_encoder = AutoencoderKL.from_pretrained( self.path, subfolder="vae", use_auth_token=token ).to(device) def forward(self, x): return self.vae_encoder.encode(x).latent_dist.sample() class VAEEncoder(BaseModel): def __init__( self, hf_token, device, verbose, path, max_batch_size, embedding_dim ): super(VAEEncoder, self).__init__( hf_token, device=device, verbose=verbose, path=path, max_batch_size=max_batch_size, embedding_dim=embedding_dim, ) self.name = "VAE encoder" def get_model(self): vae_encoder = TorchVAEEncoder(self.hf_token, self.device, self.path) return vae_encoder def get_input_names(self): return ["images"] def get_output_names(self): return ["latent"] def get_dynamic_axes(self): return { "images": {0: "B", 2: "8H", 3: "8W"}, "latent": {0: "B", 2: "H", 3: "W"}, } def get_input_profile( self, batch_size, image_height, image_width, static_batch, static_shape ): assert batch_size >= self.min_batch and batch_size <= self.max_batch min_batch = batch_size if static_batch else self.min_batch max_batch = batch_size if static_batch else self.max_batch self.check_dims(batch_size, image_height, image_width) ( min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width, _, _, _, _, ) = self.get_minmax_dims( batch_size, image_height, image_width, static_batch, static_shape ) return { "images": [ (min_batch, 3, min_image_height, min_image_width), (batch_size, 3, image_height, image_width), (max_batch, 3, max_image_height, max_image_width), ], } def get_shape_dict(self, batch_size, image_height, image_width): latent_height, latent_width = self.check_dims( batch_size, image_height, image_width ) return { "images": 
(batch_size, 3, image_height, image_width), "latent": (batch_size, 4, latent_height, latent_width), } def get_sample_input(self, batch_size, image_height, image_width): self.check_dims(batch_size, image_height, image_width) return torch.randn( batch_size, 3, image_height, image_width, dtype=torch.float32, device=self.device, ) def make_VAEEncoder( version, hf_token, device, verbose, max_batch_size, inpaint=False ): return VAEEncoder( hf_token=hf_token, device=device, verbose=verbose, path=get_path(version, inpaint=inpaint), max_batch_size=max_batch_size, embedding_dim=get_embedding_dim(version), ) def make_tokenizer(version, hf_token): return CLIPTokenizer.from_pretrained( get_path(version), subfolder="tokenizer", use_auth_token=hf_token ) def is_diffusion_model(model) -> bool: try: from diffusers import UNet2DConditionModel except ImportError: return False if is_diffusion_model_pipe(model): return True if isinstance(model, (UNet2DConditionModel, DiffusionUNetWrapper)): return True if hasattr(model, "model"): return isinstance(model.model, UNet2DConditionModel) return False ================================================ FILE: optimization/nebullvm/nebullvm/tools/feedback_collector.py ================================================ import json import os from pathlib import Path from typing import Any import requests from nebullvm.config import VERSION NEBULLVM_METADATA_PATH = Path.home() / ".nebullvm/collect.json" class FeedbackCollector: def __init__( self, url: str, disable_telemetry_environ_var: str, app_version: str ): self._disable_telemetry_environ_var = disable_telemetry_environ_var self._is_active = ( int(os.getenv(disable_telemetry_environ_var, "0")) == 0 ) self._url = url self._metadata = { "nebullvm_version": VERSION, "app_version": app_version, } def _store_ip_address(self): try: self._metadata["ip_address"] = requests.get( "https://api.ipify.org" ).text except Exception: self._metadata["ip_address"] = "Unknown" @property def is_active(self): return 
self._is_active def _inform_user(self): message = ( f"Nebuly collects anonymous usage statistics to help improve the " f"product. You can opt-out by setting the environment variable " f"{self._disable_telemetry_environ_var}=1." ) print(message) def store_info(self, key: str, value: Any): if key in self._metadata and isinstance(value, list): self._metadata[key] += value else: self._metadata[key] = value def send_feedback(self, timeout: int = 30): if not self.is_active: return {} self._store_ip_address() request_body = self._metadata headers = { "accept": "application/json", "Content-Type": "application/json", } response = requests.post( self._url, data=json.dumps(request_body), headers=headers, timeout=timeout, ) return response def get(self, key: str, default: Any = None): return self._metadata.get(key, default) def reset(self, key: str): self._metadata.pop(key, None) ================================================ FILE: optimization/nebullvm/nebullvm/tools/hardware_utils.py ================================================ import os import platform import cpuinfo import psutil from nebullvm.core.models import HardwareSetup, Device, DeviceType from nebullvm.optional_modules.torch_xla import xm from nebullvm.optional_modules.utils import ( torch_is_available, tensorflow_is_available, ) from nebullvm.tools.pytorch import torch_get_device_name from nebullvm.tools.tf import tensorflow_get_gpu_name from nebullvm.tools.utils import ( gpu_is_available, tpu_is_available, neuron_is_available, ) def get_hw_setup(device: Device = None) -> HardwareSetup: accelerator = None if ( device is not None and device.type is DeviceType.GPU ) or gpu_is_available(): accelerator = _get_gpu_name() elif ( device is not None and device.type is DeviceType.TPU ) or tpu_is_available(): accelerator = _get_tpu_device_name() elif ( device is not None and device.type is DeviceType.NEURON ) or neuron_is_available(): accelerator = _get_neuron_device_name() return HardwareSetup( 
cpu=cpuinfo.get_cpu_info()["brand_raw"], operating_system=platform.system(), memory_gb=round(psutil.virtual_memory().total * 1e-9, 2), accelerator=accelerator, ) def _get_gpu_name() -> str: if torch_is_available(): name = torch_get_device_name() elif tensorflow_is_available(): name = tensorflow_get_gpu_name() else: name = "Unknown" return name def _get_neuron_device_name() -> str: output = os.popen("lshw -businfo").read() neuron_name = "Unknown Neuron" for line in output.splitlines(): if "neuron" in line.lower(): words = line.split(" ") if len(words) > 2: neuron_name = " ".join(words[-2:]) break return neuron_name def _get_tpu_device_name() -> str: return xm.xla_device_hw(xm.xla_device()) ================================================ FILE: optimization/nebullvm/nebullvm/tools/huggingface.py ================================================ from collections import OrderedDict from typing import ( Union, Iterable, List, Dict, Tuple, Type, Any, ) import numpy as np from nebullvm.core.models import Device, DeviceType from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import torch, Module try: from transformers import ( PreTrainedModel, ) from transformers.tokenization_utils import PreTrainedTokenizer except ImportError: # add placeholders for function definition PreTrainedModel = None PreTrainedTokenizer = None class PyTorchTransformerWrapper(Module): """Class for wrappering the Transformers and give them an API compatible with nebullvm. The class takes and input of the forward method positional arguments and transform them in the input dictionaries needed by transformers classes. At the end it also flattens their output. 
""" def __init__( self, core_model: Module, encoded_input: Dict[str, torch.Tensor], ): super().__init__() self.core_model = core_model self.inputs_types = OrderedDict() for key, value in encoded_input.items(): self.inputs_types[key] = value.dtype def forward(self, *args: torch.Tensor): inputs = { key: value for key, value in zip(self.inputs_types.keys(), args) } outputs = self.core_model(**inputs) outputs = outputs.values() if isinstance(outputs, dict) else outputs return tuple(flatten_outputs(outputs)) class TensorFlowTransformerWrapper(tf.keras.Model): def __init__( self, core_model: tf.Module, encoded_input: Dict[str, tf.Tensor], ): super().__init__() self.core_model = core_model self.inputs_types = OrderedDict() for key, value in encoded_input.items(): self.inputs_types[key] = value.dtype def call(self, *args: tf.Tensor): inputs = { key: value for key, value in zip(self.inputs_types.keys(), args[0]) } outputs = self.core_model(**inputs) outputs = outputs.values() if isinstance(outputs, dict) else outputs return tuple(flatten_outputs(list(outputs))) def flatten_outputs( outputs: Union[torch.Tensor, tf.Tensor, Iterable] ) -> List[Union[torch.Tensor, tf.Tensor]]: new_outputs = [] for output in outputs: if isinstance(output, (torch.Tensor, tf.Tensor)): new_outputs.append(output) else: flatten_list = flatten_outputs(output) new_outputs.extend(flatten_list) return new_outputs def get_size_recursively( tensor_tuple: Union[torch.Tensor, tf.Tensor, Tuple] ) -> List[int]: if isinstance(tensor_tuple[0], (torch.Tensor, tf.Tensor)): return [len(tensor_tuple)] else: inner_size = get_size_recursively(tensor_tuple[0]) return [len(tensor_tuple), *inner_size] def get_output_structure_from_text( text: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, tokenizer_args: Dict, device: Device, ) -> Tuple[OrderedDict, Type]: """Function needed for saving in a dictionary the output structure of the transformers model. 
""" encoded_input = tokenizer([text], **tokenizer_args) if isinstance(model, torch.nn.Module): encoded_input = encoded_input.to(device.to_torch_format()) output = model(**encoded_input) structure = OrderedDict() if isinstance(output, tuple): for i, value in enumerate(output): if isinstance(value, (torch.Tensor, tf.Tensor)): structure[f"output_{i}"] = None else: size = get_size_recursively(value) structure[f"output_{i}"] = size else: for key, value in output.items(): if isinstance(value, (torch.Tensor, tf.Tensor)): structure[key] = None else: size = get_size_recursively(value) structure[key] = size return structure, type(output) def get_output_structure_from_dict( input_example: Dict, model: PreTrainedModel, device: Device, ) -> Tuple[OrderedDict, Type]: """Function needed for saving in a dictionary the output structure of the transformers model. """ if ( isinstance(model, torch.nn.Module) and device.type is not DeviceType.TPU ): model.to(device.to_torch_format()) input_example.to(device.to_torch_format()) output = model(**input_example) structure = OrderedDict() if isinstance(output, tuple): for i, value in enumerate(output): if isinstance(value, (torch.Tensor, tf.Tensor)): structure[f"output_{i}"] = None else: size = get_size_recursively(value) structure[f"output_{i}"] = size else: for key, value in output.items(): if isinstance(value, (torch.Tensor, tf.Tensor)): structure[key] = None else: size = get_size_recursively(value) structure[key] = size return structure, type(output) def restructure_output( output: Tuple[Union[torch.Tensor, tf.Tensor]], structure: OrderedDict, output_type: Any = None, ): """Restructure the flatter output using the structure dictionary given as input. 
""" output_dict = {} idx = 0 for key, value in structure.items(): if value is None: output_dict[key] = output[idx] idx += 1 else: tensor_shape = output[idx].shape[1:] stack_fn = ( torch.stack if isinstance(output[idx], torch.Tensor) else tf.stack ) reshape_fn = ( torch.reshape if isinstance(output[idx], torch.Tensor) else tf.reshape ) output_dict[key] = list( reshape_fn( stack_fn( output[idx : int(np.prod(value)) + idx] # noqa E203 ), (*value, *tensor_shape), ) ) idx += np.prod(value) if output_type is not None: return output_type(**output_dict) return output_dict ================================================ FILE: optimization/nebullvm/nebullvm/tools/logger.py ================================================ import logging import os import sys import warnings from typing import Any from loguru import logger levels_map = { 0: "ERROR", 1: "WARNING", 2: "INFO", 3: "DEBUG", } def debug_mode_enabled(): return int(os.environ.get("DEBUG_MODE", "0")) > 0 def setup_logger(): if not debug_mode_enabled(): warnings.filterwarnings("ignore") logging_level = int(os.environ.get("NEBULLVM_LOG_LEVEL", "2")) logger.remove() logger.add( sys.stdout, colorize=True, format=( "{time:YYYY-MM-DD HH:mm:ss} | " "{level: <8} | {message}" ), level=levels_map[logging_level], ) logger.level("WARNING", color="") class LoggingContext(object): def __init__( self, logger: logging.Logger, disabled: bool = False, handler: Any = None, close: bool = True, ): self.logger = logger self.disabled = disabled self.handler = handler self.close = close def __enter__(self): self.logger.disabled = self.disabled if self.handler: self.logger.addHandler(self.handler) def __exit__(self, et: Any, ev: Any, tb: Any): if self.disabled is True: self.logger.disabled = False if self.handler: self.logger.removeHandler(self.handler) if self.handler and self.close: self.handler.close() # implicit return of None => don't swallow exceptions ================================================ FILE: 
optimization/nebullvm/nebullvm/tools/onnx.py ================================================ from typing import List, Tuple, Any, Optional, Dict import numpy as np from loguru import logger from nebullvm.config import ONNX_PROVIDERS from nebullvm.core.models import ( DeepLearningFramework, Device, DeviceType, InputInfo, DataType, ) from nebullvm.optional_modules.onnx import onnx from nebullvm.optional_modules.onnxruntime import onnxruntime as ort from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import torch def convert_to_numpy(tensor: Any): if isinstance(tensor, torch.Tensor): tensor = tensor.cpu().detach().numpy() elif isinstance(tensor, tf.Tensor) and tensor is not None: tensor = tensor.numpy() elif isinstance(tensor, int): tensor = np.array([tensor]) else: if not isinstance(tensor, np.ndarray): raise TypeError(f"Unsupported data type: {type(tensor)}") return tensor def convert_to_target_framework( tensor: np.ndarray, framework: DeepLearningFramework ) -> Any: if framework is DeepLearningFramework.PYTORCH: return torch.from_numpy(tensor) elif framework is DeepLearningFramework.TENSORFLOW: return tf.convert_to_tensor(tensor) else: return tensor def get_input_names(onnx_model: str): model = onnx.load(onnx_model) input_all = [node.name for node in model.graph.input] return input_all def get_output_names(onnx_model: str): model = onnx.load(onnx_model) output_all = [node.name for node in model.graph.output] return output_all def run_onnx_model( onnx_model: str, input_tensors: List[np.ndarray], device: Device ) -> List[np.ndarray]: from nebullvm.optional_modules.onnxruntime import onnxruntime as ort if device.type is DeviceType.GPU and len(ONNX_PROVIDERS["cuda"]) == 3: ONNX_PROVIDERS["cuda"][1] = ( "CUDAExecutionProvider", { "device_id": device.idx, }, ) model = ort.InferenceSession( onnx_model, providers=ONNX_PROVIDERS["cuda"][1:] if device.type is DeviceType.GPU else ONNX_PROVIDERS["cpu"], ) inputs = { name: 
array for name, array in zip(get_input_names(onnx_model), input_tensors) } res = model.run( output_names=get_output_names(onnx_model), input_feed=inputs ) return list(res) def _extract_dynamic_axis( onnx_model: str, data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]], input_sizes: List[Tuple[int, ...]], device: Device, max_data: int = 100, ) -> Optional[Dict]: from nebullvm.tools.utils import inspect_dynamic_size dynamic_axis = {"inputs": [{}] * len(input_sizes), "outputs": []} output_sizes = [] for i, input_data in enumerate(data): input_tensors = input_data[0] if i >= max_data: break inspect_dynamic_size( input_tensors, input_sizes, dynamic_axis["inputs"] ) outputs = tuple( run_onnx_model(onnx_model, list(input_tensors), device) ) if i == 0: dynamic_axis["outputs"] = [{}] * len(outputs) output_sizes = [tuple(output.shape[1:]) for output in outputs] inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"]) if any( len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"]) ): return dynamic_axis return None def extract_info_from_np_data( onnx_model: str, data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]], dynamic_axis: Dict, device: Device, **kwargs, ): from nebullvm.tools.utils import ifnone input_row = data[0][0] batch_size = int(input_row[0].shape[0]) if not all([input_row[0].shape[0] == x.shape[0] for x in input_row]): logger.warning("Detected not consistent batch size in the inputs.") input_sizes = [tuple(x.shape) for x in input_row] input_types = [ "int32" if x.dtype is np.int32 else "int64" if x.dtype is np.int64 else "float16" if x.dtype is np.float16 else "float32" for x in input_row ] dynamic_axis = ifnone( dynamic_axis, _extract_dynamic_axis(onnx_model, data, input_sizes, device), ) return batch_size, input_sizes, input_types, dynamic_axis def get_output_info_onnx( onnx_model: str, input_tensors: List[np.ndarray], device ) -> List[Tuple[Tuple[int, ...], DataType]]: res = run_onnx_model(onnx_model, input_tensors, device) sizes 
= [ (tuple(output.shape), DataType.from_framework_format(output.dtype)) for output in res ] return sizes def create_model_inputs_onnx(input_infos: List[InputInfo]) -> List[np.ndarray]: input_tensors = ( np.random.randn(*input_info.size).astype(np.float32) if input_info.dtype is DataType.FLOAT32 else np.random.randint( size=input_info.size, low=input_info.min_value or 0, high=input_info.max_value or 100, ) for input_info in input_infos ) return list(input_tensors) def onnx_is_gpu_available(): return ort.get_device() == "GPU" ================================================ FILE: optimization/nebullvm/nebullvm/tools/pytorch.py ================================================ from pathlib import Path from typing import List, Tuple, Optional, Dict, Union, Sequence from loguru import logger from nebullvm.core.models import Device, DataType, DeviceType, InputInfo from nebullvm.optional_modules.torch import torch, DataLoader from nebullvm.tools.data import DataManager from nebullvm.tools.diffusers import get_default_dynamic_info FX_MODULE_NAME = "NebullvmFxModule" def save_with_torch_fx(model: torch.nn.Module, path: Path): traced_model = torch.fx.symbolic_trace(model) traced_model.to_folder(path, FX_MODULE_NAME) def load_with_torch_fx( path: Path, state_dict_name: str = "pruned_state_dict.pt" ): module_file = path / "module.py" with open(module_file, "r") as f: module_str = f.read() exec(module_str, globals()) model = eval(FX_MODULE_NAME)() model.load_state_dict(torch.load(path / state_dict_name)) return model def get_output_info_torch( torch_model: torch.nn.Module, input_tensors: List[torch.Tensor], device: Device, ) -> List[Tuple[Tuple[int, ...], DataType]]: if device.type is DeviceType.GPU: input_tensors = [x.to(device.to_torch_format()) for x in input_tensors] torch_model.to(device.to_torch_format()) with torch.no_grad(): outputs = torch_model(*input_tensors) if isinstance(outputs, torch.Tensor): return [ ( tuple(outputs.size()), 
def _extract_dynamic_axis(
    torch_model: torch.nn.Module,
    dataloader: DataManager,
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Infer which input/output axes change size across the given batches.

    Every batch (up to ``max_data``) is compared against ``input_sizes`` and
    run through the model; the outputs are compared against the output
    shapes produced by the first batch.

    Args:
        torch_model: Model run on each batch via ``run_torch_model``.
        dataloader: Iterable of samples whose first element is the tuple of
            input tensors.
        input_sizes: Reference shape (batch axis included) for each input.
        device: Device forwarded to ``run_torch_model``.
        max_data: Maximum number of batches to inspect.

    Returns:
        A dict with ``"inputs"`` and ``"outputs"`` lists (one ``{axis: tag}``
        dict per tensor) if at least one axis differs, otherwise ``None``.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # One dict per tensor. ``[{}] * n`` would repeat the *same* dict n times,
    # so an axis flagged for one input would show up on all of them.
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(dataloader):
        if i >= max_data:
            break
        input_tensors = input_data[0]
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(run_torch_model(torch_model, input_tensors, device))
        if i == 0:
            # The first batch defines the reference output shapes.
            dynamic_axis["outputs"] = [{} for _ in outputs]
            output_sizes = [tuple(output.shape) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None
def test_custom_input_data():
    """A list of ((inputs,), label) tuples is wrapped as-is by DataManager."""
    n_samples = 4
    samples = [
        ((torch.randn(2, 3, 10, 10),), torch.randn(2, 1))
        for _ in range(n_samples)
    ]
    manager = DataManager(samples)

    # Four samples, each made of an (inputs, label) pair.
    assert len(manager) == 4
    assert len(manager[0]) == 2
    # A single input tensor per sample, with the original shapes preserved.
    assert len(manager[0][0]) == 1
    assert manager[0][0][0].shape == (2, 3, 10, 10)
    assert manager[0][1].shape == (2, 1)
def test_torch_dataloader_single_input_without_label():
    """A label-less torch DataLoader still yields two-element samples."""
    features = torch.randn(8, 3, 10, 10)
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features), batch_size=2
    )
    manager = DataManager.from_dataloader(loader)

    # 8 items in batches of 2 -> 4 samples.
    assert len(manager) == 4
    # Each sample is still a pair even though the dataset has no label.
    assert len(manager[0]) == 2
    assert len(manager[0][0]) == 1
    assert manager[0][0][0].shape == (2, 3, 10, 10)
class TestGetHwSetup(unittest.TestCase):
    def _assert_host_info_present(self, setup):
        """Check the fields that must be filled whatever the accelerator."""
        self.assertGreater(len(setup.cpu), 0)
        self.assertGreater(len(setup.operating_system), 0)
        self.assertGreater(setup.memory_gb, 0)

    @patch(
        "nebullvm.tools.hardware_utils.gpu_is_available", return_value=False
    )
    @patch(
        "nebullvm.tools.hardware_utils.tpu_is_available", return_value=False
    )
    @patch(
        "nebullvm.tools.hardware_utils.neuron_is_available", return_value=False
    )
    def test_hw_setup__gpu_not_available(self, *_):
        # With every accelerator probe patched to False no accelerator
        # must be reported.
        hw_setup = hardware_utils.get_hw_setup()
        self.assertIsNone(hw_setup.accelerator)
        self._assert_host_info_present(hw_setup)

    @patch("nebullvm.tools.hardware_utils.gpu_is_available", return_value=True)
    @patch(
        "nebullvm.tools.hardware_utils._get_gpu_name", return_value="mock-gpu"
    )
    def test_hw_setup__gpu_is_available(self, *_):
        # The accelerator name comes from the patched GPU-name helper.
        hw_setup = hardware_utils.get_hw_setup()
        self.assertEqual("mock-gpu", hw_setup.accelerator)
        self._assert_host_info_present(hw_setup)
class TestCheckDevice(unittest.TestCase):
    def _assert_device(self, device, expected_type, expected_idx):
        """Check both the type and the index of a resolved device."""
        self.assertEqual(expected_type, device.type)
        self.assertEqual(device.idx, expected_idx)

    # --- no device requested: the first available accelerator is picked ---

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_none_no_device_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=True)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_none_gpu_is_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.GPU, 0)

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=True)
    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_none_tpu_is_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.TPU, 0)

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=True)
    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_none_neuron_is_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.NEURON, 0)

    # --- explicit device requests ---

    def test_device_is_cpu(self):
        self._assert_device(utils.check_device("cpu"), DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    def test_device_is_gpu_no_gpu_available(self, _):
        # Whatever the spelling or index, a missing GPU falls back to CPU 0.
        for requested in ("gpu", "cuda", "cuda:1", "gpu:2"):
            self._assert_device(
                utils.check_device(requested), DeviceType.CPU, 0
            )

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=True)
    def test_device_is_gpu_gpu_is_available(self, _):
        expectations = (("gpu", 0), ("cuda", 0), ("cuda:1", 1), ("gpu:2", 2))
        for requested, idx in expectations:
            self._assert_device(
                utils.check_device(requested), DeviceType.GPU, idx
            )

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_tpu_no_tpu_available(self, _):
        for requested in ("tpu", "tpu:1"):
            self._assert_device(
                utils.check_device(requested), DeviceType.CPU, 0
            )

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=True)
    def test_device_is_tpu_tpu_is_available(self, _):
        for requested, idx in (("tpu", 0), ("tpu:1", 1)):
            self._assert_device(
                utils.check_device(requested), DeviceType.TPU, idx
            )

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_neuron_no_neuron_available(self, _):
        for requested in ("neuron", "neuron:1"):
            self._assert_device(
                utils.check_device(requested), DeviceType.CPU, 0
            )

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=True)
    def test_device_is_neuron_neuron_is_available(self, _):
        for requested, idx in (("neuron", 0), ("neuron:1", 1)):
            self._assert_device(
                utils.check_device(requested), DeviceType.NEURON, idx
            )
def _extract_dynamic_axis(
    tf_model: tf.Module,
    dataset: List[Tuple[Tuple[tf.Tensor, ...], Any]],
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Infer which input/output axes change size across the given samples.

    Every sample (up to ``max_data``) is compared against ``input_sizes`` and
    run through the model; the outputs are compared against the output
    shapes produced by the first sample.

    Args:
        tf_model: Model run on each sample via ``run_tf_model``.
        dataset: Samples whose first element is the tuple of input tensors.
        input_sizes: Reference shape (batch axis included) for each input.
        device: Device forwarded to ``run_tf_model``.
        max_data: Maximum number of samples to inspect.

    Returns:
        A dict with ``"inputs"`` and ``"outputs"`` lists (one ``{axis: tag}``
        dict per tensor) if at least one axis differs, otherwise ``None``.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # One dict per tensor. ``[{}] * n`` would repeat the *same* dict n times,
    # so an axis flagged for one input would show up on all of them.
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(dataset):
        if i >= max_data:
            break
        input_tensors = input_data[0]
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(run_tf_model(tf_model, input_tensors, device))
        if i == 0:
            dynamic_axis["outputs"] = [{} for _ in outputs]
            # Keep the full shape: inspect_dynamic_size zips it with
            # ``tensor.shape`` starting from axis 0 (the batch axis), so
            # dropping the first dimension would misalign every comparison.
            output_sizes = [tuple(output.shape) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None
class BaseTransformation(ABC):
    """Callable wrapper around a single input transformation.

    Subclasses implement ``_transform``; instances are applied by calling
    them. ``to_dict``/``from_dict`` give a minimal serialization that stores
    only where the class can be imported from.
    """

    @abstractmethod
    def _transform(self, _input: Any, **kwargs) -> Any:
        raise NotImplementedError()

    def __call__(self, _input: Any, **kwargs):
        return self._transform(_input, **kwargs)

    def to_dict(self):
        """Return the module and class name needed to re-create the object."""
        return {
            "module": self.__class__.__module__,
            "name": self.__class__.__name__,
        }

    @classmethod
    def from_dict(cls, tfm_dict: Dict):
        """Build an instance; the base implementation ignores ``tfm_dict``."""
        return cls()


class MultiStageTransformation(BaseTransformation):
    """Apply a list of transformations one after the other."""

    def __init__(self, transformations: List[BaseTransformation]):
        # The list is stored as given (not copied).
        self._tfms = transformations

    def _transform(self, _input: Any, **kwargs) -> Any:
        # Each stage receives the output of the previous one.
        for tfm in self._tfms:
            _input = tfm(_input, **kwargs)
        return _input

    def append(self, __tfm: BaseTransformation):
        """Add one transformation at the end of the pipeline."""
        self._tfms.append(__tfm)

    def extend(self, tfms: List[BaseTransformation]):
        """Add several transformations at the end of the pipeline."""
        self._tfms += tfms

    def to_dict(self) -> Dict:
        """Serialize every stage with its own ``to_dict``."""
        return {"tfms": [tfm.to_dict() for tfm in self._tfms]}

    def to_list(self):
        """Return the underlying list of stages."""
        return self._tfms

    @classmethod
    def from_dict(cls, tfms_dict: Dict):
        """Re-create a pipeline from the output of ``to_dict``.

        Each entry of ``tfms_dict["tfms"]`` must provide the ``"module"`` and
        ``"name"`` of a class exposing ``from_dict``.

        Raises:
            ImportError: If the module cannot be imported or does not define
                the requested name.
        """
        # Local import keeps this block self-contained.
        import importlib

        tfms = []
        for tfm_dict in tfms_dict["tfms"]:
            # Resolve the class explicitly. The previous exec/eval pair
            # depended on exec() writing into the function's locals, which
            # no longer works with the locals() semantics of Python 3.13.
            module = importlib.import_module(tfm_dict["module"])
            try:
                tfm_class = getattr(module, tfm_dict["name"])
            except AttributeError as exc:
                # Same exception type a ``from module import name`` raises.
                raise ImportError(
                    f"cannot import name {tfm_dict['name']!r} "
                    f"from {tfm_dict['module']!r}"
                ) from exc
            tfms.append(tfm_class.from_dict(tfm_dict))
        return cls(tfms)

    def copy(self):
        """Return a new pipeline holding deep copies of the stages."""
        new_list = copy.deepcopy(self._tfms)
        return self.__class__(new_list)

    def __len__(self):
        return len(self._tfms)
class VerifyContiguity(BaseTransformation):
    """Return torch tensors in contiguous layout; pass anything else through."""

    def _transform(self, _input: Any, **kwargs) -> Any:
        # Only non-contiguous torch tensors need to be converted.
        if isinstance(_input, torch.Tensor) and not _input.is_contiguous():
            return _input.contiguous()
        return _input
def inspect_dynamic_size(
    tensors: Tuple[Any, ...],
    sizes: List[Tuple[int, ...]],
    axis_list: List[Dict],
):
    """Record in ``axis_list`` every axis whose size differs from ``sizes``.

    ``tensors[i].shape`` is compared element by element with ``sizes[i]``.
    For each mismatching axis, ``axis_list[i][axis]`` is set to
    ``"batch_size"`` for axis 0 and to ``"val_<actual>_<expected>"`` for any
    other axis. ``axis_list`` is modified in place; nothing is returned.
    """
    for position, (tensor, reference) in enumerate(zip(tensors, sizes)):
        pairs = zip(tensor.shape, reference)
        for axis, (actual, expected) in enumerate(pairs):
            if actual != expected:
                axis_list[position][axis] = (
                    "batch_size" if axis == 0 else f"val_{actual}_{expected}"
                )
def check_input_data(input_data: Union[Iterable, Sequence]):
    """Tell whether ``input_data`` has the expected sample layout.

    The data is accepted when it is non-empty, its first sample is a tuple
    whose first element is a tuple of inputs, the first input is a numpy
    array / torch tensor / tf tensor and, if a second element (label) is
    present, it is one of those tensor types, an ``int``, a ``float`` or
    ``None``.

    Returns:
        ``True`` if the layout matches, ``False`` otherwise (including when
        the data cannot be measured or indexed).
    """
    # Explicit checks instead of ``assert``: assertions are removed when
    # Python runs with -O, which would make every input look valid.
    try:
        if len(input_data) == 0:
            return False
        sample = input_data[0]
        if not isinstance(sample, tuple):
            return False
        if not isinstance(sample[0], tuple):
            return False
        tensor_types = (np.ndarray, torch.Tensor, tf.Tensor)
        if not isinstance(sample[0][0], tensor_types):
            return False
        if len(sample) > 1 and not isinstance(
            sample[1], tensor_types + (int, float, type(None))
        ):
            return False
    except Exception:
        # Any failure while probing means the layout is not the expected
        # one. KeyboardInterrupt/SystemExit are deliberately not caught.
        return False
    return True


def is_data_subscriptable(input_data: Union[Iterable, Sequence]):
    """Return ``True`` if ``input_data[0]`` can be evaluated without error."""
    try:
        input_data[0]
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        return False
    return True
def is_huggingface_data(data_sample: Any) -> bool:
    """Tell whether a sample looks like HuggingFace-style input.

    A sample qualifies when it is dict-like (see ``is_dict_type``), a string,
    or an indexable object whose first element is a string.
    """
    return (
        is_dict_type(data_sample)
        or isinstance(data_sample, str)
        or isinstance(data_sample[0], str)
    )


def is_dict_type(data_sample: Any):
    """Return ``True`` if calling ``data_sample.items()`` succeeds."""
    try:
        data_sample.items()
    except AttributeError:
        return False
    return True
def check_device(device: Optional[str] = None) -> Device:
    """Resolve a device string (or ``None``) into a ``Device`` object.

    With ``device=None`` the first available accelerator is selected, probing
    GPU, then Neuron, then TPU, and falling back to CPU. Otherwise the string
    is matched case-insensitively against the accelerator keywords ("cuda" or
    "gpu", "neuron", "tpu") and handed to ``_set_device`` together with the
    index parsed by ``_get_idx``; any other string selects the CPU.
    """
    # (keywords, device type, availability probe), in priority order. The
    # probes are stored uncalled so that only the needed ones are executed.
    accelerators = (
        (("cuda", "gpu"), DeviceType.GPU, gpu_is_available),
        (("neuron",), DeviceType.NEURON, neuron_is_available),
        (("tpu",), DeviceType.TPU, tpu_is_available),
    )

    if device is None:
        for _, device_type, is_available in accelerators:
            if is_available():
                return Device(device_type)
        return Device(DeviceType.CPU)

    requested = device.lower()
    for keywords, device_type, is_available in accelerators:
        if any(keyword in requested for keyword in keywords):
            return _set_device(
                accelerator_is_available=is_available(),
                device_type=device_type,
                idx=_get_idx(device),
            )
    return Device(DeviceType.CPU)
def run_in_different_venv(
    requirements_file: str,
    script_path: str,
    use_gpu: bool,
    *args,
):
    """Run a Python script inside a freshly created, temporary virtualenv.

    The environment is created in a temporary directory, a pinned torch /
    torchvision pair is installed, then the requirements file, and finally
    the script is executed with the environment's interpreter. Each step is
    run with ``subprocess.check_call``.

    Args:
        requirements_file (str): File (.txt) containing the list of
            requirements, installed with ``pip install -r``.
        script_path (str): Path to the script that must be run.
        use_gpu (bool): If True install the ``+cu111`` torch builds from the
            PyTorch wheel index, otherwise the default builds.
        args: Arguments passed to the script.
    """
    logger.debug(f"Debug: Running script {script_path} in a new virtual env.")
    with tempfile.TemporaryDirectory() as target_dir_path:
        logger.debug("Debug: Creating virtual environment...")
        venv_builder = EnvBuilder(with_pip=True)
        venv_builder.create(str(target_dir_path))
        # Interpreter of the new environment, recorded by EnvBuilder.
        python_exe = venv_builder.context.env_exe

        logger.debug("Debug: Installing requirements...")
        pip_install = [python_exe, "-m", "pip", "install"]
        if use_gpu:
            torch_packages = [
                "torch==1.9.1+cu111",
                "torchvision==0.10.1+cu111",
                "-f",
                "https://download.pytorch.org/whl/torch_stable.html",
            ]
        else:
            torch_packages = ["torch<=1.9.1", "torchvision<=0.10.1"]
        subprocess.check_call(pip_install + torch_packages)
        subprocess.check_call(pip_install + ["-r", requirements_file])

        logger.debug("Debug: Executing script...")
        subprocess.check_call([python_exe, script_path, *args])
# Packaging script for the ``nebullvm`` distribution.
from pathlib import Path
from setuptools import setup, find_packages

# Runtime dependencies declared to pip through ``install_requires``.
# NOTE(review): the pins for py-cpuinfo and requests are not identical to the
# ones listed in requirements.txt -- confirm which set is the intended one.
REQUIREMENTS = [
    "numpy>=1.21.0, <1.24.0",
    "py-cpuinfo>=8.0.0",
    "PyYAML>=6.0",
    "psutil>=5.0.0",
    "requests>=2.26.0",
    "tqdm>=4.36.0",
    "packaging>=21.3",
    "loguru>=0.5.3",
]

# The README next to this file is used as the package long description.
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text(encoding="utf8")

setup(
    name="nebullvm",
    version="0.10.0",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
    long_description=long_description,
    include_package_data=True,
    long_description_content_type="text/markdown",
)
If you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers) ## 🧑‍🏫 Installation You can install the package by cloning the repository and running the following commands: ```bash git clone https://github.com/nebuly-ai/nebullvm.git cd nebullvm/apps/accelerate/open_alpha_tensor pip install -e . ``` ## 🚀 Get started For training your AlphaTensor model, you can execute the following command: ```bash python main.py ``` Model parameters can be given either as command line arguments or as a JSON file. The `config.json` file contains the default parameters for training a model for matrix size 4x4x4. Alternatively, if you want more fine-grained control over the training process, you can use the Python API: ```python from open_alpha_tensor import train_alpha_tensor cardinality_vector = 5 # The actions can have values in range [-2, 2] N_bar = 100 # parameter for smoothing the temperature while adjusting the probability distribution matrix_size = 5 input_size = matrix_size**2 n_steps = 15 n_actions = cardinality_vector ** (3 * input_size // n_steps) action_memory = 7 train_alpha_tensor( tensor_length=action_memory + 1, input_size=input_size, scalars_size=1, emb_dim=2048, n_steps=n_steps, n_logits=n_actions, n_samples=32, device="cuda", len_data=2048, n_synth_data=1000000, pct_synth=0.7, batch_size=32, epochs=600000, lr=1e-4, lr_decay_factor=0.5, lr_decay_steps=5000, weight_decay=1e-5, optimizer_name="adamw", loss_params=(1, 1), limit_rank=150, checkpoint_dir="path/to/checkpoint/dir", checkpoint_data_dir="path/where/to/save/data/generated/by/the/model", n_actors=1, mc_n_sim=200, n_cob=100000, cob_prob=0.9983, data_augmentation=True, N_bar=N_bar, random_seed=42, extra_devices=None, save_dir="path/to/save/final/model", ) ``` ## 🧪 Missing features - [ ] Release weights of pre-trained models. **Coming soon**. - [ ] Add compilation of Alpha Tensor kernels in OpenAI's Triton and JAX/XLA.
- [ ] Add support for fine-tuning on target hardware. - [ ] Support training on Multiple GPUs (it allows training on a larger batch size). - [ ] Add support for other compilers (e.g. llvm). - [ ] Reduce memory footprint of the Acting Agent. - [ ] Improve acting speed. ## 💫 Contributing We welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [linked](https://docs.nebuly.com/contributions) page for more information on how to get involved. A special thanks to [BrianPulfer](https://github.com/BrianPulfer) for his awesome contribution to the OpenAlphaTensor module. ================================================ FILE: optimization/open_alpha_tensor/config.json ================================================ { "batch_size": 16, "max_epochs": 600000, "action_memory": 7, "optimizer": "adamw", "weight_decay": 1e-5, "lr": 1e-4, "lr_decay_factor": 0.1, "lr_decay_steps": 500000, "device": "cuda:0", "len_data": 2048, "pct_synth": 0.9, "n_synth_data": 100000, "limit_rank": 125, "alpha": 1.0, "beta": 1.0, "matrix_size": 4, "embed_dim": 1024, "actions_sampled": 32, "n_actors": 1, "mc_n_sim": 200, "n_cob": 100000, "cob_prob": 0.9983, "cardinality_vector": 5, "n_bar": 100 } ================================================ FILE: optimization/open_alpha_tensor/main.py ================================================ import json import os from argparse import ArgumentParser from pathlib import Path from open_alpha_tensor import train_alpha_tensor def _compute_largest_divisor(n: int) -> int: """Compute the largest divisor of n.""" for i in range(n // 2, 0, -1): if n % i == 0: return i return 1 def main(): config_file = Path(os.getenv("CONFIG_FILE", "config.json")) if config_file.exists(): with open(config_file) as f: config = json.load(f) else: config = {} parser = ArgumentParser() parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--max_epochs", 
type=int, default=1) parser.add_argument("--action_memory", type=int, default=1) parser.add_argument("--optimizer", type=str, default="adamw") parser.add_argument("--weight_decay", type=float, default=1e-5) parser.add_argument("--lr", type=float, default=1e-4) parser.add_argument("--lr_decay_factor", type=float, default=0.5) parser.add_argument("--lr_decay_steps", type=int, default=5000) parser.add_argument("--device", type=str, default="cuda") # parser.add_argument("--half", action="store_true") parser.add_argument("--len_data", type=int, default=100) parser.add_argument("--pct_synth", type=float, default=0.5) parser.add_argument("--n_synth_data", type=int, default=100) parser.add_argument("--limit_rank", type=int, default=15) parser.add_argument("--alpha", type=float, default=1.0) parser.add_argument("--beta", type=float, default=1.0) parser.add_argument("--random_seed", type=int, default=None) parser.add_argument("--checkpoint_dir", type=str, default=None) parser.add_argument("--checkpoint_data_dir", type=str, default=None) parser.add_argument("--matrix_size", type=int, default=3) parser.add_argument("--embed_dim", type=int, default=1024) parser.add_argument("--actions_sampled", type=int, default=10) parser.add_argument("--n_actors", type=int, default=1) parser.add_argument("--mc_n_sim", type=int, default=100) parser.add_argument("--n_cob", type=int, default=100000) parser.add_argument("--cob_prob", type=float, default=0.9983) # 1 - 0.0017 parser.add_argument("--data_augmentation", action="store_true") parser.add_argument("--cardinality_vector", type=int, default=5) parser.add_argument( "--n_bar", type=int, default=100, help="N_bar parameter for policy temperature.", ) parser.add_argument("--save_dir", type=str, default=None) parser.add_argument("extra_devices", nargs="*", type=str, default=[]) parser.set_defaults(**config) args = parser.parse_args() cardinality_vector = args.cardinality_vector N_bar = args.n_bar input_size = args.matrix_size**2 n_steps = 
def train_alpha_tensor(
    tensor_length: int,
    input_size: int,
    scalars_size: int,
    emb_dim: int,
    n_steps: int,
    n_logits: int,
    n_samples: int,
    optimizer_name: str,
    lr: float,
    lr_decay_factor: float,
    lr_decay_steps: int,
    weight_decay: float,
    loss_params: Tuple[float, float],
    checkpoint_dir: str,
    checkpoint_data_dir: str,
    epochs: int,
    batch_size: int,
    len_data: int,
    n_synth_data: int,
    pct_synth: float,
    limit_rank: int,
    n_actors: int,
    mc_n_sim: int,
    N_bar: int,
    device: str,
    save_dir: str,
    random_seed: int,
    n_cob: int,
    cob_prob: float,
    data_augmentation: bool,
    extra_devices: List[str],
):
    """Train an AlphaTensor model that searches for more efficient matrix
    multiplications and return the result of the training operation.

    Every argument is forwarded unchanged, by keyword, to
    ``TrainAlphaTensorRootOp.execute``.

    Args:
        tensor_length (int): Number of tensors kept as history.
        input_size (int): Flattened size of the matrices to be multiplied.
        scalars_size (int): Size of the scalar vectors fed to the torso
            model.
        emb_dim (int): Embedding dimension.
        n_steps (int): Number of steps used to get a single action out of a
            triplet.
        n_logits (int): Number of logits output by the policy head.
        n_samples (int): Number of samples used by the policy head at
            evaluation time.
        optimizer_name (str): Name of the optimizer used.
        lr (float): Learning rate.
        lr_decay_factor (float): Learning rate's decay factor.
        lr_decay_steps (int): Number of learning rate's decay steps.
        weight_decay (float): Weight decay used by the optimizer.
        loss_params (Tuple[float, float]): Alpha and Beta parameters used in
            the loss function.
        checkpoint_dir (str): Directory used to store model checkpoints.
        checkpoint_data_dir (str): Directory used to store games as JSON
            files.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size.
        len_data (int): Number of training samples used (both actor
            generated and synthetic).
        n_synth_data (int): Number of synthetic training samples.
        pct_synth (float): Initial percentage of synthetic samples used for
            training.
        limit_rank (int): Maximum number of steps per episode and maximum
            rank for synthetically-generated matrices.
        n_actors (int): Number of actors playing a game at each training
            step.
        mc_n_sim (int): Number of simulations during Monte Carlo tree
            search.
        N_bar (int): N_bar parameter used to compute tau when improving the
            policy.
        device (str): The name of the torch device used for training.
        save_dir (str): Directory where the final trained model will be
            stored.
        random_seed (int): Randomizing seed.
        n_cob (int): Number of change of basis (cob) used for a single
            training sample.
        cob_prob (float): Probability of applying a change of basis.
        data_augmentation (bool): Whether to randomly swap the last
            operation of an episode with another operation.
        extra_devices (List[str]): Extra devices names used for multi-GPU
            training.

    Returns:
        The value of ``TrainAlphaTensorRootOp.get_result()`` once the
        operation has been executed.
    """
    training_arguments = dict(
        tensor_length=tensor_length,
        input_size=input_size,
        scalars_size=scalars_size,
        emb_dim=emb_dim,
        n_steps=n_steps,
        n_logits=n_logits,
        n_samples=n_samples,
        optimizer_name=optimizer_name,
        lr=lr,
        lr_decay_factor=lr_decay_factor,
        lr_decay_steps=lr_decay_steps,
        weight_decay=weight_decay,
        loss_params=loss_params,
        checkpoint_dir=checkpoint_dir,
        checkpoint_data_dir=checkpoint_data_dir,
        epochs=epochs,
        batch_size=batch_size,
        len_data=len_data,
        n_synth_data=n_synth_data,
        pct_synth=pct_synth,
        limit_rank=limit_rank,
        n_actors=n_actors,
        mc_n_sim=mc_n_sim,
        N_bar=N_bar,
        device=device,
        save_dir=save_dir,
        random_seed=random_seed,
        n_cob=n_cob,
        cob_prob=cob_prob,
        data_augmentation=data_augmentation,
        extra_devices=extra_devices,
    )
    training_op = TrainAlphaTensorRootOp()
    training_op.execute(**training_arguments)
    return training_op.get_result()
def remove_duplicates(reducing_tensor: torch.Tensor):
    """Drop the entries along dimension 1 that repeat an earlier entry.

    Args:
        reducing_tensor (torch.Tensor): The tensor to remove duplicates from.

    Returns:
        A 4-tuple ``(unique_tensor, old_to_new, duplicates_of, kept)``:
        ``unique_tensor`` is ``reducing_tensor`` restricted to the first
        occurrence of each distinct entry, ``old_to_new`` maps every original
        position to the position of its representative in ``unique_tensor``,
        ``duplicates_of`` maps each kept position to the list of later
        positions holding the same entry, and ``kept`` is the list of kept
        positions in increasing order.
    """
    # reducing tensor has shape (1, N_mc, S, S, S)
    kept_positions = []
    duplicates_of = {}
    for position in range(reducing_tensor.shape[1]):
        candidate = reducing_tensor[:, position]
        # First already-kept entry identical to the candidate, if any.
        representative = next(
            (
                kept
                for kept in kept_positions
                if (reducing_tensor[:, kept] - candidate == 0).all()
            ),
            None,
        )
        if representative is None:
            kept_positions.append(position)
            duplicates_of[position] = []
        else:
            duplicates_of[representative].append(position)

    old_idx_to_new_idx_map = {}
    for new_position, (kept, copies) in enumerate(duplicates_of.items()):
        for old_position in (kept, *copies):
            old_idx_to_new_idx_map[old_position] = new_position

    return (
        reducing_tensor[:, kept_positions],
        old_idx_to_new_idx_map,
        duplicates_of,
        kept_positions,
    )
# vector cardinality represents the number of values it can take an entry # of u, v or w. bs, k, n_steps = actions.shape[:3] len_token = 3 * state.shape[2] // n_steps actions = map_action_to_triplet(actions, vec_cardinality, len_token) actions = actions.reshape(bs, k, n_steps * len_token) vec_dim = state.shape[2] u = actions[:, :, :vec_dim].reshape(bs, k, vec_dim, 1, 1) v = actions[:, :, vec_dim : 2 * vec_dim].reshape( # noqa E203 bs, k, 1, vec_dim, 1 ) w = actions[:, :, 2 * vec_dim :].reshape(bs, k, 1, 1, vec_dim) # noqa E203 reducing_tensor = u * v * w ( reducing_tensor, old_idx_to_new_idx, repetition_map, not_duplicate_indexes, ) = remove_duplicates(reducing_tensor) old_state = state[:, 0] new_state = old_state.unsqueeze(1) - reducing_tensor rolling_states = torch.roll(state, 1)[:, 2:] return ( [ torch.cat( [ new_state[:, i : i + 1], # noqa E203 reducing_tensor[:, i : i + 1], # noqa E203 rolling_states, ], dim=1, ) for i in range(k) ], old_idx_to_new_idx, repetition_map, not_duplicate_indexes, ) def _reduce_memory_consumption_before_storing( possible_states: List[torch.Tensor], ): """Reduce the memory consumption before storing the states. Args: possible_states (List[torch.Tensor]): The possible states. """ final_states = [state[:, 0:2] for state in possible_states] previous_actions = possible_states[0][:, 2:] storing_dict = { "final_states": final_states, "previous_actions": previous_actions, } return storing_dict def _recompose_possible_states(reduced_memory_states_dict: Dict): """Recompose the possible states from the reduced memory states. Args: reduced_memory_states_dict (Dict): The reduced memory states. 
def from_hash(hashable_tensor: str, shape: tuple) -> torch.Tensor:
    """Converts a hash string back to a tensor of the given shape.

    The string is expected to be a list of numbers joined by underscores, as
    produced by ``to_hash``. Since ``to_hash`` casts the tensor to integers
    before serializing, the values are restored as floats holding integer
    values.

    Args:
        hashable_tensor (str): The hash string.
        shape (tuple): The shape of the original tensor.

    Returns:
        torch.Tensor: A float tensor with the requested shape.

    Raises:
        ValueError: If one of the fields cannot be parsed as a float.
        RuntimeError: If the number of fields does not match ``shape``.
    """
    values = [float(field) for field in hashable_tensor.split("_")]
    # ``Tensor.resize`` (non in-place) is deprecated and does not accept the
    # shape as a single tuple; ``reshape`` is the supported equivalent.
    return torch.tensor(values).reshape(shape)
def select_future_state(
    possible_states: List[torch.Tensor],
    q_values: torch.Tensor,
    N_s_a: torch.Tensor,
    repetitions: Dict[int, list],
    c_1: float = 1.25,
    c_2: float = 19652,
    return_idx: bool = False,
) -> torch.Tensor:
    """Pick the child state with the highest upper confidence bound.

    The bound is ``q + pi * sqrt(sum(N) / (1 + N)) * (c_1 + log((sum(N) +
    c_2 + 1) / c_2))`` where ``pi`` is, for every child index found in
    ``repetitions``, the number of duplicates recorded for that child.

    Args:
        possible_states (List[torch.Tensor]): Candidate child states.
        q_values (torch.Tensor): Current value estimates of the children.
        N_s_a (torch.Tensor): Visit counts of the children.
        repetitions (Dict[int, list]): For each child index, the list of
            duplicated action indexes.
        c_1 (float): Additive exploration constant.
        c_2 (float): Scale of the logarithmic exploration term.
        return_idx (bool): If True return the index of the best child
            instead of the state itself.

    Returns:
        The selected state, or its index (a 0-dim tensor) when
        ``return_idx`` is True.
    """
    # q_values (1, K, 1)
    n_children = N_s_a.shape[1]
    duplicate_counts = []
    for child_idx, _ in enumerate(possible_states):
        if child_idx in repetitions:
            duplicate_counts.append(len(repetitions[child_idx]))
    pi = torch.tensor(duplicate_counts).to(q_values.device)
    if pi.shape[0] != n_children:
        print(pi)
        print(pi.shape, q_values.shape, N_s_a.shape)
        pi = pi[:n_children]

    total_visits = torch.sum(N_s_a)
    exploration = torch.sqrt(total_visits / (1 + N_s_a))
    scale = c_1 + torch.log((total_visits + c_2 + 1) / c_2)
    ucb = q_values.reshape(-1) + pi * exploration * scale

    best_child = ucb.argmax()
    if not return_idx:
        return possible_states[best_child]
    return best_child
def backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor):
    """Backward pass of the Monte Carlo algorithm.

    Walks the trajectory from its last entry to its first one, accumulating
    a reward and updating, in place, the statistics stored in ``states_dict``
    for every (state, action) pair that was traversed.

    Args:
        trajectory: Sequence of ``(state_hash, action_idx)`` pairs. An entry
            whose ``action_idx`` is None marks the leaf node.
        states_dict: Maps a state hash to a 6-tuple whose second element is
            the old-index-to-new-index map, fourth element the visit counts
            ``N_s_a`` and fifth element the ``q_values``.
        leaf_q_value: Value added to the reward when the leaf entry is
            encountered.
    """
    reward = 0
    for idx, (state, action_idx) in enumerate(reversed(trajectory)):
        if action_idx is None:  # leaf node
            reward += leaf_q_value
        else:
            (
                _,
                old_idx_to_new_idx,
                _,
                N_s_a,
                q_values,
                _,
            ) = states_dict[state]
            # Keep the accumulated reward on the same device as the stored
            # statistics before mixing them.
            if isinstance(reward, torch.Tensor):
                reward = reward.to(q_values.device)
            action_idx = int(action_idx)
            # Translate the action index into the index of its
            # de-duplicated representative when a mapping is available.
            if action_idx in old_idx_to_new_idx:
                not_dupl_index = old_idx_to_new_idx[int(action_idx)]
            else:
                not_dupl_index = action_idx
            # Each traversed action lowers the reward by one.
            reward -= 1
            # Running mean of the rewards observed for this action.
            q_values[:, not_dupl_index] = (
                N_s_a[:, not_dupl_index] * q_values[:, not_dupl_index]
                + reward
            ) / (N_s_a[:, not_dupl_index] + 1)
            N_s_a[:, not_dupl_index] += 1
@torch.no_grad()
def compute_improved_policy(
    state_dict: Dict,
    states: List[str],
    model_n_steps: int,
    model_n_logits: int,
    N_bar: int,
):
    """Build the improved policy of every listed state from its visit counts.

    For each state the visit counts ``N`` are turned into the distribution
    ``N ** (1 / tau) / (N ** (1 / tau)).sum()``, with
    ``tau = log(N.sum()) / log(N_bar)`` when ``N.sum() > N_bar`` and
    ``tau = 1`` otherwise. The probability of each sampled action is then
    accumulated, step by step, on the logit selected by that action.

    Args:
        state_dict (Dict): Maps a state hash to a tuple holding the visit
            counts at position 3 and the sampled actions at position 5.
        states (List[str]): Hashes of the states to process.
        model_n_steps (int): Size of the step dimension of the output.
        model_n_logits (int): Size of the logit dimension of the output.
        N_bar (int): Threshold and log-base used to compute ``tau``.

    Returns:
        torch.Tensor: Tensor of shape
        ``(len(states), model_n_steps, model_n_logits)``.
    """
    policies = torch.zeros(len(states), model_n_steps, model_n_logits)
    N_bar = torch.tensor(N_bar)
    for state_pos, state_hash in enumerate(states):
        record = state_dict[state_hash]
        visit_counts, sampled_actions = record[3], record[5]

        tau = 1
        if visit_counts.sum() > N_bar:
            tau = (torch.log(visit_counts.sum()) / torch.log(N_bar)).item()
        sharpened = visit_counts ** (1 / tau)
        improved_policy = sharpened / sharpened.sum()

        for sample_id, action_ids in enumerate(sampled_actions[0]):
            sample_prob = improved_policy[0, sample_id]
            for step_id, action_id in enumerate(action_ids):
                policies[state_pos, step_id, action_id] += sample_prob
    return policies
def get_change_basis_matrix(
    tensor_size: int,
    n_cob: int,
    entry_distribution: Callable = torch.randn,
    random_seed: int = None,
):
    """Lazily generate change of basis matrices.

    Every matrix is the product of an upper triangular and a lower
    triangular matrix whose diagonals contain only +1 or -1, and whose
    off-diagonal entries come from a single draw of ``entry_distribution``.

    Args:
        tensor_size (int): Size of the tensor.
        n_cob (int): Number of change of basis matrices.
        entry_distribution (Callable, optional): Function called with the
            shape ``(tensor_size, tensor_size)`` that returns the
            off-diagonal entries.
        random_seed (int, optional): Seed set on the torch global generator
            when iteration starts.

    Yields:
        torch.Tensor: A ``(tensor_size, tensor_size)`` matrix.
    """

    def _random_sign_diagonal():
        # Diagonal matrix whose entries are uniformly drawn in {-1, +1}.
        signs = 2 * (torch.rand(tensor_size) > 0.5).float() - 1
        return torch.diag(signs)

    if random_seed is not None:
        torch.random.manual_seed(random_seed)

    generated = 0
    while generated < n_cob:
        # The three random draws are kept in this order so that seeded runs
        # are reproducible.
        upper_diagonal = _random_sign_diagonal()
        lower_diagonal = _random_sign_diagonal()
        entries = entry_distribution((tensor_size, tensor_size))

        upper = upper_diagonal + torch.triu(entries, diagonal=1)
        lower = lower_diagonal + torch.tril(entries, diagonal=-1)
        yield torch.matmul(upper, lower)
        generated += 1
def compute_move(triplets: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
    """Return the outer product ``u x v x w`` of the vectors of a triplet,
    i.e. the rank-one tensor that will be subtracted from the current state.

    Args:
        triplets (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Tensors
            u, v, and w. Each one is flattened before the product.

    Returns:
        torch.Tensor: Tensor ``t`` with ``t[i, j, k] = u[i] * v[j] * w[k]``.
    """
    u, v, w = triplets
    along_first = u.reshape(-1)[:, None, None]
    along_second = v.reshape(-1)[None, :, None]
    along_third = w.reshape(-1)[None, None, :]
    return along_first * along_second * along_third
class GameDataBuffer(Dataset):
    """Buffer to store the data from the games played by the MCTS agent.

    Every game is saved on disk, inside a temporary directory owned by the
    buffer, as three files (``states_{i}.pt``, ``policies_{i}.pt`` and
    ``rewards_{i}.pt``). ``game_data`` maps the game index ``i`` to the
    number of states of that game and is used to translate a flat sample
    index into a (game, step) pair.
    """

    def __init__(self, device: str, max_buffer_size: int):
        """Initializes the buffer.

        Args:
            device (str): Name of the torch device to use.
            max_buffer_size (int): Maximum size of the buffer.
        """
        self.num_games = 0
        self.temp_dir = tempfile.mkdtemp("game_data_buffer")
        self.game_data = {}
        self.max_buffer_size = max_buffer_size
        self.device = device

    def __del__(self):
        # ``temp_dir`` may be missing if __init__ failed early, and the
        # directory may already be gone: cleanup is best effort and must
        # never raise from a finalizer.
        temp_dir = getattr(self, "temp_dir", None)
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def add_game(
        self,
        states: List[torch.Tensor],
        policies: List[torch.Tensor],
        rewards: List[torch.Tensor],
    ):
        """Adds a played game to the buffer.

        Args:
            states (List[torch.Tensor]): Observed game states.
            policies (List[torch.Tensor]): List of policies.
            rewards (List[torch.Tensor]): Observed rewards.
        """
        self.game_data[self.num_games] = len(states)
        torch.save(
            states, os.path.join(self.temp_dir, f"states_{self.num_games}.pt")
        )
        torch.save(
            policies,
            os.path.join(self.temp_dir, f"policies_{self.num_games}.pt"),
        )
        torch.save(
            rewards,
            os.path.join(self.temp_dir, f"rewards_{self.num_games}.pt"),
        )
        self.num_games += 1
        if self.num_games >= self.max_buffer_size:
            # Wrap around: the next games overwrite the oldest ones. Note
            # that this is not thread safe. A lock should be added if
            # multiple threads are used.
            self.num_games = 0

    def __len__(self):
        return sum(self.game_data.values())

    @torch.no_grad()
    def __getitem__(self, idx):
        # Walk the games until the one containing the flat index is found;
        # afterwards ``idx`` is the step inside game ``i``.
        i = 0
        while idx >= self.game_data[i]:
            idx -= self.game_data[i]
            i += 1
        states = torch.load(os.path.join(self.temp_dir, f"states_{i}.pt"))
        policies = torch.load(os.path.join(self.temp_dir, f"policies_{i}.pt"))
        rewards = torch.load(os.path.join(self.temp_dir, f"rewards_{i}.pt"))
        return (
            states[idx].to(self.device),
            get_scalars(states[idx], idx, with_bs=False).to(self.device),
            policies[idx].to(self.device).argmax(dim=-1),
            rewards[idx].to(self.device).reshape(1),
        )

    def save_game_data(self, path: str):
        """Copy the content of the temporary directory in path and save
        game_data there in json format.

        Args:
            path (str): Destination directory.
        """
        shutil.copytree(self.temp_dir, path, dirs_exist_ok=True)
        with open(os.path.join(path, "game_data.json"), "w") as f:
            json.dump(self.game_data, f)

    def load_game_data(self, path: str):
        """Load game_data from json format and copy the content of path in
        the temporary directory.

        Args:
            path (str): Directory previously written by ``save_game_data``.
        """
        with open(os.path.join(path, "game_data.json"), "r") as f:
            stored_game_data = json.load(f)
        # JSON object keys are always strings, while the buffer indexes
        # games with integers: convert them back.
        self.game_data = {
            int(game_idx): n_states
            for game_idx, n_states in stored_game_data.items()
        }
        # The temporary directory already exists (created in __init__), so
        # the copy must be allowed to merge into it.
        shutil.copytree(path, self.temp_dir, dirs_exist_ok=True)
        self.num_games = len(self.game_data)
        if self.num_games >= self.max_buffer_size:
            # Same wrap-around rule used by add_game.
            self.num_games = 0
""" def __init__( self, len_data, pct_synth, tensor_size, n_synth_data, limit_rank, prob_distr, action_memory_len: int, device: str, n_steps: int, random_seed=None, ): self.synthetic_data_buffer = SyntheticDataBuffer( tensor_size, n_synth_data, limit_rank, prob_distr, action_memory_len, n_steps=n_steps, device=device, random_seed=random_seed, ) self.game_data_buffer = GameDataBuffer( device=device, max_buffer_size=100000 ) self.best_game_data_buffer = GameDataBuffer( device=device, max_buffer_size=1000 ) self.len_data = len_data self.pct_synth = pct_synth self.pct_best_game = 0 self.synth_bool = torch.ones(len_data, dtype=torch.bool) self.synth_idx = torch.from_numpy( np.random.choice( len(self.synthetic_data_buffer), len_data, replace=False ) ) self.game_idx = None self.best_game_idx = None self.action_memory_len = action_memory_len self.tensor_size = tensor_size self.device = device def change_training_split(self, pct_synth, pct_best_game): self.pct_synth = pct_synth self.pct_best_game = pct_best_game def recompute_synthetic_indexes(self): if len(self.game_data_buffer) > 0: self.synth_bool = torch.rand(self.len_data) < self.pct_synth len_synth_data = self.synth_bool.sum().item() self.synth_idx = torch.from_numpy( np.random.choice( len(self.synthetic_data_buffer), len_synth_data, replace=False, ) ) if len(self.best_game_data_buffer) > 0 and self.pct_best_game > 0: len_game_data = int( (1 - self.pct_synth - self.pct_best_game) * self.len_data ) replace_game = len_game_data > len(self.game_data_buffer) len_best_game_data = ( self.len_data - len_synth_data - len_game_data ) replace_best_game = len_best_game_data > len( self.best_game_data_buffer ) self.game_idx = torch.from_numpy( np.random.choice( len(self.game_data_buffer), len_game_data, replace=replace_game, ) ) self.best_game_idx = torch.from_numpy( np.random.choice( len(self.best_game_data_buffer), len_best_game_data, replace=replace_best_game, ) ) else: len_game_data = self.len_data - len_synth_data 
replace_game = len_game_data > len(self.game_data_buffer) self.game_idx = torch.from_numpy( np.random.choice( len(self.game_data_buffer), len_game_data, replace=replace_game, ) ) def __getitem__(self, idx): if self.synth_bool[idx]: return self.synthetic_data_buffer[ self.synth_idx[self.synth_bool[:idx].sum()] ] else: if self.pct_best_game > 0 and self.best_game_idx is not None: if idx - self.synth_bool[:idx].sum() < len(self.best_game_idx): return self.best_game_data_buffer[ self.best_game_idx[idx - self.synth_bool[:idx].sum()] ] else: return self.game_data_buffer[ self.game_idx[ idx - self.synth_bool[:idx].sum() - len(self.best_game_idx) ] ] else: return self.game_data_buffer[ self.game_idx[idx - self.synth_bool[:idx].sum()] ] def __len__(self): return self.len_data def add_game( self, states: List[torch.Tensor], policies: List[torch.Tensor], rewards: List[torch.Tensor], ): self.game_data_buffer.add_game(states, policies, rewards) def add_best_game( self, states: List[torch.Tensor], policies: List[torch.Tensor], rewards: List[torch.Tensor], ): self.best_game_data_buffer.add_game(states, policies, rewards) def save_game_data(self, path): self.game_data_buffer.save_game_data(os.path.join(path, "game_data")) self.best_game_data_buffer.save_game_data( os.path.join(path, "best_game_data") ) def load_game_data(self, path): self.game_data_buffer.load_game_data(os.path.join(path, "game_data")) self.best_game_data_buffer.load_game_data( os.path.join(path, "best_game_data") ) @property def input_tensor(self) -> torch.Tensor: max_matrix_size = int(np.sqrt(self.tensor_size)) input_tensor = torch.zeros( 1, self.action_memory_len + 1, self.tensor_size, self.tensor_size, self.tensor_size, ) matrix_dims = ( torch.randint(1, max_matrix_size, (3,)) .detach() .cpu() .numpy() .tolist() ) operation_tensor = self._build_tensor_game_input( *matrix_dims, action_memory_len=self.action_memory_len ) input_tensor[ 0, :, : operation_tensor.shape[1], : operation_tensor.shape[2], : 
operation_tensor.shape[3], ] = operation_tensor return input_tensor.to(self.device) @staticmethod def _build_tensor_game_input( dim_1: int, dim_k: int, dim_2: int, action_memory_len: int ): """Build the input tensor for the game. The input tensor has shape (action_memory_len+1, matrix_size**2, matrix_size**2, matrix_size**2). The first slice represent the matrix multiplication tensor which will be reduced by the TensorGame algorithm. The other slices represent the action memory. """ input_tensor = torch.zeros( action_memory_len + 1, dim_1 * dim_k, dim_k * dim_2, dim_1 * dim_2 ) for r in range(dim_1 * dim_2): for k in range(dim_k): input_tensor[ 0, (r // dim_2) * dim_k + k, k * dim_2 + r % dim_2, r ] = 1 return input_tensor def games_are_good(self): return False ================================================ FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/generation.py ================================================ from typing import Callable import torch def generate_synthetic_data( tensor_size: int, n_data: int, limit_rank: int, prob_distr: Callable = torch.randn, random_seed: int = None, ): """Generates synthetic demonstrations. Args: tensor_size (int): Size of the tensor. n_data (int): Number of demonstrations. limit_rank (int): Limit rank of each tensor. prob_distr (Callable, optional): Distribution of the entries of the tensor. random_seed (int, optional): Random seed for reproducibility. 
""" if random_seed is not None: torch.random.manual_seed(random_seed) for _ in range(n_data): # rank = torch.randint(low=1, high=limit_rank + 1, size=(1,)).item() rank = limit_rank output_tensor = torch.zeros(tensor_size, tensor_size, tensor_size) list_of_triplets = [] for i in range(rank): valid_triplet = False while not valid_triplet: u = prob_distr(tensor_size) v = prob_distr(tensor_size) w = prob_distr(tensor_size) generated_tensor = ( u.reshape(-1, 1, 1) * v.reshape(1, -1, 1) * w.reshape(1, 1, -1) ) if not (generated_tensor == 0).all(): valid_triplet = True list_of_triplets.append((u, v, w)) output_tensor += generated_tensor yield output_tensor, list_of_triplets def f_prob_distribution(size): """Samples a tensor of values from a distribution with a peak at 0 and a tail at -2 and 2. Args: size (int): Number of values to sample. """ f_vals = torch.tensor([-2, -1, 0, 1, 2]) f_probs = torch.tensor([0.001, 0.099, 0.8, 0.099, 0.001]).unsqueeze(0) f_cum_sum = torch.cumsum(f_probs, dim=-1) unif_prob = torch.rand((size, 1)) tensor_idx = torch.argmax((unif_prob <= f_cum_sum).int(), dim=1) tensor = f_vals[tensor_idx] return tensor def z2_prob_distribution(size): """Samples a binary tensor with uniform probability of 0 and 1. Args: size (int): Number of values to sample. """ return (torch.rand(size) > 0.5).int() ================================================ FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/utils.py ================================================ from typing import Tuple import torch def get_scalars(input_tensor: torch.Tensor, t_step: int, with_bs: bool = True): """Adds the time step to the current state tensor. Args: input_tensor (torch.Tensor): Current state tensor. t_step (int): Current time step. with_bs (bool, optional): Whether the batch size is present in the input tensor. 
""" # scalars containing the iteration time if with_bs: bs = input_tensor.shape[0] scalars = torch.zeros((bs, 1)) scalars[:, 0] = t_step else: scalars = torch.tensor(t_step).unsqueeze(-1).float() return scalars def map_triplet_to_action( triplet: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], base: int, n_steps: int, add_bias: bool = True, ): """Maps a triplet of tensors to an action. Args: triplet (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Triplet of tensors u, v, and w. base (int): Base used for the conversion. n_steps (int): Number of steps in the action. add_bias (bool, optional): Whether to add a bias to the action. """ # map the triplet to an action. First, we concatenate the three tensors and # then we convert it to an action using the given base representation. Each # element is converted using the formula: # action += element * base^(element_index) u, v, w = triplet n_dim = u.ndim action = torch.cat((u, v, w), dim=-1) action = action.reshape(-1, n_steps, action.shape[-1] // n_steps) if n_dim == 1: action = action.squeeze(0) if add_bias: action = action + base // 2 action = action * torch.tensor( [base**i for i in range(action.shape[-1])] ) action = action.sum(dim=-1) return action # @torch.jit.script def _single_action_to_triplet( action_val: int, basis: int, out_dim: int, bias: int, device: str, ): """Converts an action to the original triplet (u, v, w) that generated it. Args: action_val (int): Action to convert. basis (int): Basis used for the conversion. out_dim (int): Output dimension. bias (int): Bias to subtract from the action. device (str): Name of the torch device to use. 
""" triplet = torch.zeros(out_dim).to(device) if action_val > 0: idx = int( torch.log(torch.tensor(action_val)) // torch.log(torch.tensor(basis)) ) else: idx = 0 while idx >= 0: temp = int(basis**idx) triplet[idx] = action_val // temp - bias action_val = action_val - temp idx -= 1 return triplet def map_action_to_triplet( action_tensor: torch.Tensor, cardinality: int = 5, vector_size: int = 5, add_bias: bool = True, ): """Maps a batch of actions to the batch of triplets that generated them. Args: action_tensor (torch.Tensor): Batch of actions. cardinality (int, optional): Cardinality of the action space. vector_size (int, optional): Size of the vector. add_bias (bool, optional): Whether to use bias. """ # map the action to a triplet. The action is converted to a base 5 # representation and then the three elements are extracted from it. # The action has shape (bs, n_steps) and it contains the token for # recreating u, v and w. The token is a number between 0 and n_logits. action_shape = action_tensor.shape action_tensor = action_tensor.reshape(-1) if add_bias: bias = cardinality // 2 else: bias = 0 triplets = torch.stack( [ _single_action_to_triplet( action_tensor[idx], cardinality, vector_size, bias, action_tensor.device, ) for idx in range(len(action_tensor)) ] ) final_size = triplets.shape[-1] return triplets.reshape((*action_shape, final_size)) ================================================ FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/__init__.py ================================================ ================================================ FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/alpha_tensor.py ================================================ import torch from open_alpha_tensor.core.modules.extras import ( QuantileLoss, ValueRiskManagement, ) from open_alpha_tensor.core.modules.heads import PolicyHead, ValueHead from open_alpha_tensor.core.modules.torso import TorsoModel class 
class AlphaTensorModel(torch.nn.Module):
    """Full model: torso, policy head and value head.

    Called with target actions and values it returns the two training
    losses; called without them it samples actions and returns their
    probabilities together with a value estimate.
    """

    def __init__(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        # scalars_size = s, input_size = S, tensor_length = T, emb_dim = c
        super().__init__()
        self.tensor_length = tensor_length
        self.input_size = input_size
        self.emb_dim = emb_dim
        self.torso = TorsoModel(
            scalars_size=scalars_size,
            input_size=input_size,
            tensor_length=tensor_length,
            out_size=emb_dim,
        )
        print("Build policy head")
        self.policy_head = PolicyHead(
            emb_size=3 * input_size * input_size,
            emb_dim=emb_dim,
            n_steps=n_steps,
            n_logits=n_logits,
            n_samples=n_samples,
        )
        print("Build value head")
        # value dependent on num_head and proj_dim
        self.value_head = ValueHead(input_size=2048)
        self.policy_loss_fn = torch.nn.CrossEntropyLoss(reduction="sum")
        self.quantile_loss_fn = QuantileLoss()
        self.risk_value_management = ValueRiskManagement()

    @property
    def device(self):
        """Device of the first model parameter."""
        return next(self.parameters()).device

    def _train_forward(
        self,
        x: torch.Tensor,
        s: torch.Tensor,
        g_action: torch.Tensor,
        g_value: torch.Tensor,
    ):
        """Returns ``(policy_loss, value_loss)`` for a training batch."""
        # shapes
        # x = (N, T, S, S, S)
        # s = (N, s)
        # g_action = (N, N_steps)
        # g_value = (N, )
        embedding = self.torso(x, s)
        logits, features = self.policy_head(embedding, g_action)
        flat_logits = logits.reshape(-1, logits.shape[-1])
        flat_targets = g_action.reshape(-1)
        policy_loss = self.policy_loss_fn(flat_logits, flat_targets)
        quantiles = self.value_head(features)
        value_loss = self.quantile_loss_fn(quantiles, g_value.float())
        return policy_loss, value_loss

    def _eval_forward(self, x: torch.Tensor, s: torch.Tensor):
        """Returns sampled actions, their probabilities and a value."""
        embedding = self.torso(x, s)
        actions, probabilities, features = self.policy_head(embedding)
        value = self.risk_value_management(self.value_head(features))
        return actions, probabilities, value

    def forward(
        self,
        x: torch.Tensor,
        s: torch.Tensor,
        g_action: torch.Tensor = None,
        g_value: torch.Tensor = None,
    ):
        """Dispatches to training mode when ``g_action`` is given,
        otherwise to evaluation mode.
        """
        if g_action is not None:
            assert g_value is not None
            return self._train_forward(x, s, g_action, g_value)
        return self._eval_forward(x, s)

    @property
    def n_logits(self):
        return self.policy_head.n_logits

    @property
    def n_steps(self):
        return self.policy_head.n_steps

    @property
    def n_samples(self):
        return self.policy_head.n_samples
class AttentionHead(torch.nn.Module):
    """Single attention head: queries come from ``x``, keys and values
    from ``y``.
    """

    def __init__(self, x_size: int, y_size: int, proj_dim: int):
        # x_size = N_x
        # y_size = N_y
        super(AttentionHead, self).__init__()
        self.proj_dim = proj_dim
        self.proj_dim_isqrt = 1 / torch.sqrt(torch.tensor(proj_dim))
        self.queries_proj_layer = torch.nn.Linear(x_size, proj_dim)
        self.keys_proj_layer = torch.nn.Linear(y_size, proj_dim)
        self.values_proj_layer = torch.nn.Linear(y_size, proj_dim)

    def forward(self, x: torch.Tensor, y: torch.Tensor, mask: bool = False):
        """Returns the attended values, last dimension ``proj_dim``."""
        queries = self.queries_proj_layer(x)
        keys = self.keys_proj_layer(y)
        values = self.values_proj_layer(y)
        scores = queries @ keys.transpose(-2, -1)
        weights = F.softmax(scores * self.proj_dim_isqrt, dim=-1)
        if mask:
            # Applied after the softmax: keeps only the weights strictly
            # above the diagonal, without renormalising the rows.
            # NOTE(review): this lets position i attend only to positions
            # j > i; confirm this is the intended masking direction.
            weights = weights.triu(diagonal=1)
        return weights @ values


class AttentionDenseBlock(torch.nn.Module):
    """Residual feed-forward block: LayerNorm -> Linear -> GELU -> Linear."""

    def __init__(self, inner_size: int, multiplier: int = 4):
        super().__init__()
        hidden_size = inner_size * multiplier
        self.norm = torch.nn.LayerNorm(inner_size)
        self.linear = torch.nn.Linear(inner_size, hidden_size)
        self.activation = torch.nn.GELU()
        self.linear_final = torch.nn.Linear(hidden_size, inner_size)

    def forward(self, x: torch.Tensor):
        hidden = self.linear(self.norm(x))
        hidden = self.activation(hidden)
        return x + self.linear_final(hidden)


class AlphaMultiHeadAttention(torch.nn.Module):
    """Multi-head attention followed by a residual dense block.

    Inputs are layer-normalised, passed through ``n_heads`` independent
    heads, concatenated, projected back to ``x_dim`` and added to ``x``.
    """

    def __init__(
        self,
        x_dim: int,
        y_dim: int,
        proj_dim: int = 32,
        n_heads: int = 16,
        multiplier: int = 4,
    ):
        # x_dim = size of the last dimension of x
        # y_dim = size of the last dimension of y
        super().__init__()
        self.norm_layer_x = torch.nn.LayerNorm(x_dim)
        self.norm_layer_y = torch.nn.LayerNorm(y_dim)
        heads = [AttentionHead(x_dim, y_dim, proj_dim) for _ in range(n_heads)]
        self.module_list = torch.nn.ModuleList(heads)
        self.linear = torch.nn.Linear(n_heads * proj_dim, x_dim)
        self.dense = AttentionDenseBlock(x_dim, multiplier)

    def forward(
        self, x: torch.nn.Module, y: torch.nn.Module, mask: bool = False
    ):
        """``x`` and ``y`` are tensors (despite the annotations) whose last
        dimensions are ``x_dim`` and ``y_dim``; the output has the shape
        of ``x``.
        """
        # x.size = (Nx, c1), y.size = (Ny, c2)
        x_norm = self.norm_layer_x(x)
        y_norm = self.norm_layer_y(y)
        head_outputs = [head(x_norm, y_norm, mask) for head in self.module_list]
        x = x + self.linear(torch.cat(head_outputs, dim=-1))
        return self.dense(x)
class QuantileLoss(torch.nn.Module):
    """Quantile-regression loss built on an element-wise Huber loss."""

    def __init__(self, delta: float = 1.0):
        super().__init__()
        self.huber_loss = torch.nn.HuberLoss(reduction="none", delta=delta)

    def forward(self, q: torch.Tensor, g: torch.Tensor):
        """Returns the mean weighted Huber loss between the predicted
        quantiles ``q`` (last dimension = number of quantiles) and the
        target ``g``.
        """
        n_quantiles = q.shape[-1]
        # Quantile levels 0, 1/n, ..., (n - 1)/n.
        levels = torch.arange(n_quantiles).unsqueeze(0).to(q.device)
        tau = levels / n_quantiles
        huber = self.huber_loss(g, q)
        under_estimated = (g - q > 0).float()
        weight = torch.abs(tau - under_estimated)
        return (huber * weight).mean()


class ValueRiskManagement(torch.nn.Module):
    """Collapses predicted quantiles into one value by averaging the
    quantiles from index ``int(u_q * n)`` upwards.
    """

    def __init__(self, u_q: float = 0.75):
        super(ValueRiskManagement, self).__init__()
        self.u_q = u_q

    def forward(self, q: torch.Tensor):
        # q shape = (N, n)
        first_kept = int(self.u_q * q.shape[-1])
        return q[:, first_kept:].mean(dim=-1)
class PositionEncoding(torch.nn.Module):
    """Sinusoidal positional encoding for sequence-first inputs."""

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine ones.
        pe[:, 0, 1::2] = torch.cos(angles)[:, : d_model // 2]
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[: x.size(0)]
        return x


class PolicyHeadDoubleAttention(torch.nn.Module):
    """Decoder layer: masked self-attention over the action tokens followed
    by cross-attention over the torso embedding, each with dropout and a
    residual connection.
    """

    def __init__(
        self,
        n_steps: int,
        n_heads: int,
        n_feat: int,
        emb_size: int,
        emb_dim: int,
    ):
        # n_steps and emb_size are accepted for interface compatibility
        # but are not used by the layer.
        super().__init__()
        d_model = n_feat * n_heads
        self.layer_norm1 = torch.nn.LayerNorm(d_model)
        self.attention1 = AlphaMultiHeadAttention(d_model, d_model)
        self.drop1 = torch.nn.Dropout()
        self.layer_norm2 = torch.nn.LayerNorm(d_model)
        self.attention2 = AlphaMultiHeadAttention(d_model, emb_dim)
        self.drop2 = torch.nn.Dropout()

    def forward(self, x: torch.Tensor, e: torch.Tensor):
        x = self.layer_norm1(x)
        c = self.attention1(x, x, mask=True)
        c = self.drop1(c)
        x = x + c
        x = self.layer_norm2(x)
        c = self.attention2(x, e, mask=False)
        c = self.drop2(c)
        x = x + c
        return x


class PolicyHeadCore(torch.nn.Module):
    """Token embedding, positional encoding, a stack of decoder layers and
    the final projection to logits.
    """

    def __init__(
        self,
        emb_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_feat: int = 64,
        n_heads: int = 32,
        n_layers: int = 2,
    ):
        super().__init__()
        self.embedding = torch.nn.Embedding(n_logits, n_feat * n_heads)
        self.position_encoding = PositionEncoding(n_feat * n_heads)
        self.decoders = torch.nn.ModuleList(
            [
                PolicyHeadDoubleAttention(
                    n_steps, n_heads, n_feat, emb_size, emb_dim
                )
                for _ in range(n_layers)
            ]
        )
        self.relu = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(n_feat * n_heads, n_logits)

    def forward(self, a: torch.Tensor, e: torch.Tensor):
        """Returns ``(logits, features)`` for the action tokens ``a``.

        Args:
            a: Integer tokens; the embedding output is indexed as
                ``(batch, step, feature)``.
            e: Torso embedding passed to every decoder layer.
        """
        tokens = self.embedding(a)
        # The positional encoding is sequence-first while the embeddings
        # are batch-first: swap the axes so that the encoding follows the
        # step index rather than the batch index.
        x = self.position_encoding(tokens.transpose(0, 1)).transpose(0, 1)
        for layer in self.decoders:
            x = layer(x, e)
        o = self.linear2(self.relu(x))
        return o, x
def sample_from_logits(a):
    """Samples one class per row and returns its probability.

    Uses inverse-CDF sampling on the softmax of the logits.

    Args:
        a (torch.Tensor): Logits of shape ``(N, n_classes)``.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Sampled class indexes of shape
        ``(N,)`` and the softmax probability of each sampled class.
    """
    pdf = F.softmax(a, dim=-1)
    cdf = torch.cumsum(pdf, dim=-1)
    random_vals = torch.rand(cdf.shape[0]).unsqueeze(-1).to(a.device)
    reached = cdf > random_vals
    # Rounding can leave the last cumulative value slightly below the draw;
    # in that case the last class must be selected, not class 0.
    reached[..., -1] = True
    new_a_idx = torch.argmax(1.0 * reached, dim=-1)
    # Read the probability from the density, not from the cumulative sum.
    probs = pdf.gather(-1, new_a_idx.unsqueeze(-1)).squeeze(-1)
    return new_a_idx, probs


class PolicyHead(torch.nn.Module):
    """Policy head: scores the target actions when training and samples
    ``n_samples`` action sequences autoregressively when evaluating.
    """

    def __init__(
        self,
        emb_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        super().__init__()
        self.n_logits = n_logits
        self.n_samples = n_samples
        self.n_steps = n_steps
        self.core = PolicyHeadCore(emb_size, emb_dim, n_steps, n_logits)

    def _train_forward(self, e: torch.Tensor, g: torch.Tensor):
        """Returns the logits for every step and the features of step 0."""
        # e is the embedding, shape = (N, m, c)
        # g holds the target tokens, shape (N, N_steps). It is rolled by
        # one position before being fed to the decoder so that the target
        # of a step is not its own input.
        g = torch.roll(g, shifts=-1, dims=1)
        o, z = self.core(g, e)
        return o, z[:, 0]

    def _eval_forward(self, e: torch.Tensor):
        """Samples ``n_samples`` action sequences per batch element.

        Returns:
            Tuple of the sampled tokens ``(N, n_samples, n_steps)``, the
            product of the per-step probabilities ``(N, n_samples)`` and the
            step-0 features averaged over the samples.
        """
        bs = e.shape[0]
        future_g = (
            torch.zeros((bs, self.n_samples, self.n_steps)).long().to(e.device)
        )
        ps = torch.ones((bs, self.n_samples)).to(e.device)
        e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1)

        # Fold the sample dimension into the batch dimension.
        future_g = future_g.view(-1, self.n_steps)
        ps = ps.view(-1)
        e = e.view(-1, e.shape[-2], e.shape[-1])
        for i in range(self.n_steps):
            o_s, z_s = self.core(future_g[:, : i + 1], e)
            future_g[:, i], p_i = sample_from_logits(o_s[:, i])
            ps *= p_i
        future_g = future_g.view(bs, self.n_samples, self.n_steps)
        ps = ps.view(bs, self.n_samples)
        return (
            future_g,
            ps,
            z_s[:, 0].view(bs, self.n_samples, *z_s.shape[2:]).mean(1),
        )

    def forward(self, e: torch.Tensor, g: torch.Tensor = None):
        if g is None:
            return self._eval_forward(e)
        return self._train_forward(e, g)
class ValueHeadCore(torch.nn.Module):
    """Linear layer followed by a ReLU."""

    def __init__(self, input_size: int, output_size: int):
        super().__init__()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.relu = torch.nn.ReLU()

    def forward(self, x: torch.Tensor):
        return self.relu(self.linear(x))


class ValueHead(torch.nn.Module):
    """Three-layer MLP followed by a linear projection to ``output_size``
    values.
    """

    def __init__(
        self, input_size: int, hidden_size: int = 512, output_size: int = 8
    ):
        super().__init__()
        # Every layer must be its own module instance: repeating a list
        # (``[layer] * 2``) would reuse one instance and tie the weights of
        # the two hidden layers. State-dict keys are unchanged.
        self.layers = torch.nn.Sequential(
            ValueHeadCore(input_size, hidden_size),
            ValueHeadCore(hidden_size, hidden_size),
            ValueHeadCore(hidden_size, hidden_size),
        )
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor):
        return self.linear(self.layers(x))
""" def __init__( self, scalars_size: int, input_size: int, tensor_length: int, out_size: int, ): # scalar_size = s # input_size = S # tensor_length = T # out_size = c super(TorsoModel, self).__init__() self.linears_1 = torch.nn.ModuleList( [ torch.nn.Linear(scalars_size, input_size * input_size) for _ in range(3) ] ) self.linears_2 = torch.nn.ModuleList( [ torch.nn.Linear(input_size * tensor_length + 1, out_size) for _ in range(3) ] ) self.attentive_modes = torch.nn.ModuleList( [TorsoAttentiveModes(out_size) for _ in range(8)] ) def forward(self, x: torch.Tensor, scalars: torch.Tensor): # x.size = (N, T, S, S, S) # scalars.size = (N, s) batch_size = x.shape[0] S = x.shape[-1] T = x.shape[1] x1 = x.permute(0, 2, 3, 4, 1).reshape(batch_size, S, S, S * T) x2 = x.permute(0, 4, 2, 3, 1).reshape(batch_size, S, S, S * T) x3 = x.permute(0, 3, 4, 2, 1).reshape(batch_size, S, S, S * T) input_list = [x1, x2, x3] for i in range(3): temp = self.linears_1[i](scalars).reshape(batch_size, S, S, 1) input_list[i] = torch.cat([input_list[i], temp], dim=-1) input_list[i] = self.linears_2[i](input_list[i]) x1, x2, x3 = input_list for layer in self.attentive_modes: x1, x2, x3 = layer(x1, x2, x3) return torch.stack([x1, x2, x3], dim=2).reshape( batch_size, 3 * S * S, -1 ) ================================================ FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/training.py ================================================ from pathlib import Path from typing import Tuple, List import torch.optim import tqdm from torch.utils.data import DataLoader from open_alpha_tensor.config import ( BASE_CHECKPOINT_DATA_DIR, BASE_CHECKPOINT_DIR, ) from open_alpha_tensor.core.actors.stage import actor_prediction from open_alpha_tensor.core.data.basis_change import ChangeOfBasis from open_alpha_tensor.core.data.dataset import TensorGameDataset from open_alpha_tensor.core.data.generation import f_prob_distribution from open_alpha_tensor.core.data.utils import map_action_to_triplet from 
@torch.no_grad()
def _single_act(
    actor_id: int,
    model: torch.nn.Module,
    input_tensor: torch.Tensor,
    device: str,
    mc_n_sim: int,
    N_bar: int,
    cob: ChangeOfBasis,
    max_rank: int,
):
    """Plays one episode with the MCTS actor on the given device.

    Meant to be run several times in parallel, once per actor id.

    Args:
        actor_id (int): The id of the actor.
        model (torch.nn.Module): The model used to take the action.
        input_tensor (torch.Tensor): State of the game.
        device (str): The name of the torch device used for acting.
        mc_n_sim (int): Number of simulations during Monte Carlo tree search.
        N_bar (int): N_bar parameter used to compute tau when improving the
            policy.
        cob (ChangeOfBasis): The change of basis applied to the input tensor.
        max_rank (int): The maximum rank achieved by the actor before tree
            search is stopped.

    Returns:
        Tuple of the actor id and the episode's states, policies and rewards,
        all moved to the CPU.
    """
    print(f"Acting with actor {actor_id}")
    model.to(device)
    cob.device = device
    start_state = cob(input_tensor.to(device))
    states, policies, rewards = actor_prediction(
        model, start_state, max_rank, mc_n_sim, N_bar
    )
    print(f"Actor {actor_id} finished")
    cpu_states = [state.to("cpu") for state in states]
    return actor_id, cpu_states, policies.to("cpu"), rewards.to("cpu")
""" last_action = actions[-1] swap_index = torch.randint(0, len(states) - 1, (1,)).item() actions[-1] = actions[swap_index] actions[swap_index] = last_action actual_state = states[swap_index] for i in range(swap_index + 1, len(states) + 1): prev_action = actions[i - 1] triplet = map_action_to_triplet( prev_action, vector_size=actual_state.shape[-1] ) vector_size = actual_state.shape[-1] // 3 bs = actual_state.shape[0] u = triplet[:, :vector_size].reshape(bs, -1, 1, 1) v = triplet[:, vector_size : 2 * vector_size].reshape( # noqa E203 bs, 1, -1, 1 ) w = triplet[:, 2 * vector_size :].reshape(bs, 1, 1, -1) # noqa E203 reduced_state = u * v * w fut_state = actual_state[:, 0] - reduced_state new_state = actual_state[:, 1:].roll(1, dims=1) new_state[:, 0] = reduced_state actual_state = torch.cat([fut_state, new_state], dim=1) states[i] = actual_state return states, actions class Trainer: """Trainer for the AlphaTensor model. The trainer does not require an explicit loss since the loss is computed by the model itself. The trainer is responsible for both the training step and the acting one, storing acting performance in a buffer. """ def __init__( self, model: AlphaTensorModel, tensor_size: int, n_steps: int, batch_size: int, optimizer: torch.optim.Optimizer, device: str, len_data: int, pct_synth: float, n_synth_data: int, limit_rank: int, n_cob: int, cob_prob: float, data_augmentation: bool, loss_params: Tuple[float, float] = None, random_seed: int = None, checkpoint_dir: str = None, checkpoint_data_dir: Path = None, extra_devices: List[str] = None, ): """Initializes the trainer. Args: model (AlphaTensorModel): The model to train. tensor_size (int): Flattened size of the matrices to be multiplied. n_steps (int): Number of steps used to get a single action out of a triplet. batch_size (int): Batch size. optimizer (torch.optim.Optimizer): The optimizer used to train the model. device (str): The name of the torch device used for training. 
len_data (int): Number of training samples used (both actor generated and synthetic). pct_synth (float): Initial percentage of synthetic samples used for training. n_synth_data (int): Number of synthetic training samples. limit_rank (int): Maximum rank for synthetically-generated matrices. n_cob (int): Number of change of basis (cob) used for a single training sample. cob_prob (float): Probability of applying a change of basis. data_augmentation (bool): Whether to randomly swap the last operation of an episode with another operation. loss_params (Tuple[float, float]): Alpha and Beta parameters used in the loss function. random_seed (int): Randomizing seed. checkpoint_dir (str): Directory used to store model checkpoints. checkpoint_data_dir (str): Directory used to store games as JSON files. extra_devices (List[str]): Extra devices names used for multi-GPU training. """ self.model = model self.optimizer = optimizer self.device = device self.dataset = TensorGameDataset( len_data, pct_synth, tensor_size, n_synth_data, limit_rank, f_prob_distribution, device=device, n_steps=n_steps, action_memory_len=(model.tensor_length - 1), random_seed=random_seed, ) self.batch_size = batch_size self.max_rank = limit_rank if loss_params is None: self.alpha = 1 self.beta = 1 else: self.alpha, self.beta = loss_params self.checkpoint_dir = Path( checkpoint_dir if checkpoint_dir else BASE_CHECKPOINT_DIR ) self.checkpoint_dir.mkdir(exist_ok=True, parents=True) self.checkpoint_data_dir = ( checkpoint_data_dir if checkpoint_data_dir else Path(BASE_CHECKPOINT_DATA_DIR) ) self.checkpoint_data_dir.mkdir(exist_ok=True, parents=True) self.change_of_basis = ChangeOfBasis( tensor_size, n_cob, cob_prob, device, random_seed ) self.data_augmentation = data_augmentation self.extra_devices = extra_devices def train_step(self): """Executes a single training step by optimizing the current model parameters.""" self.dataset.recompute_synthetic_indexes() self.model.train() total_loss = 0 dl = 
DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True) print("Training AlphaTensor") for states, scalars, policies, rewards in tqdm.tqdm(dl): loss_policy, loss_value = self.model( states, scalars, policies, rewards ) loss = self.alpha * loss_policy + self.beta * loss_value self.optimizer.zero_grad() loss.backward() self.optimizer.step() total_loss += loss.item() print(f"Total loss: {total_loss}") @torch.no_grad() def act_step( self, input_tensor: torch.Tensor, n_games: int, mc_n_sim: int, N_bar: int, ): """Runs actors in parallel to generate multiple games starting from the same input tensor. Args: input_tensor (torch.Tensor): The input tensor used to generate the games. n_games (int): Number of games to generate / actors to be run in parallel. mc_n_sim (int): Number of simulations used in the Monte Carlo tree search. N_bar (int): N_bar parameter used to compute tau when improving the policy. """ self.model.eval() best_reward = -1e10 best_game = None if self.extra_devices: from joblib import Parallel, delayed # this means that there is an empty GPU available # thus we can use it to parallelize the acting step # use joblib to parallelize the acting step # we should use _single_act as a function to be parallelized extra_devices = ( self.extra_devices * (n_games // len(self.extra_devices)) + self.extra_devices[: n_games % len(self.extra_devices)] ) self.model.to("cpu") input_tensor = input_tensor.to("cpu") print(f"Starting acting phase with {n_games} games") results = Parallel(n_jobs=len(self.extra_devices))( delayed(_single_act)( actor_id, self.model, input_tensor, extra_devices[actor_id], mc_n_sim, N_bar, self.change_of_basis, self.max_rank, ) for actor_id in range(n_games) ) self.model.to(self.device) for actor_id, states, policies, rewards in results: if rewards[-1] > best_reward: print(f"New best actor! 
def train(
    self,
    n_epochs: int,
    n_games: int,
    mc_n_sim: int,
    N_bar: int,
    initial_lr: float,
    lr_decay_factor: float,
    lr_decay_steps: int,
    starting_epoch: int = 0,
):
    """Alternates training and acting steps for the requested epochs.

    Each epoch runs one ``train_step``; every tenth epoch (``epoch % 10 ==
    0``) an ``act_step`` generates new games. Two thresholds derived from
    ``n_epochs`` change the regime: once epoch number ``n_epochs // 50`` is
    reached the dataset split is changed to ``(0.7, 0.05)``, and once epoch
    number ``n_epochs // 10`` is reached ``mc_n_sim`` is multiplied by 4.
    Both are also applied up-front when training resumes past them. A
    checkpoint (model + optimizer state, plus the game data) is written
    every 100 epochs, and training stops early when
    ``self.dataset.games_are_good()`` returns a truthy value.

    Args:
        n_epochs (int): Number of training epochs.
        n_games (int): Number of games to generate / actors to be run in
            parallel at each acting step.
        mc_n_sim (int): Number of simulations used in the Monte Carlo tree
            search at each acting step.
        N_bar (int): N_bar parameter used to compute tau when improving the
            policy.
        initial_lr (float): Initial learning rate.
        lr_decay_factor (float): Learning rate's decay factor.
        lr_decay_steps (int): Number of learning rate's decay steps.
        starting_epoch (int, optional): Epoch from which to start / resume
            training.
    """
    self.model = self.model.to(self.device)

    split_epoch = n_epochs // 50
    boost_epoch = n_epochs // 10

    # Thresholds already passed when resuming from a checkpoint.
    first_epoch_number = starting_epoch + 1
    if first_epoch_number > split_epoch:
        self.dataset.change_training_split(0.7, 0.05)
    if first_epoch_number > boost_epoch:
        mc_n_sim = mc_n_sim * 4

    for epoch in range(starting_epoch, n_epochs):
        epoch_number = epoch + 1
        if epoch_number == split_epoch:
            self.dataset.change_training_split(0.7, 0.05)
        if epoch_number == boost_epoch:
            mc_n_sim = mc_n_sim * 4

        # exponential learning-rate decay, applied only inside the decay
        # window
        if 0 < epoch < lr_decay_steps - 1:
            decayed_lr = initial_lr * lr_decay_factor ** (
                epoch / lr_decay_steps
            )
            for group in self.optimizer.param_groups:
                group["lr"] = decayed_lr

        print(f"Epoch {epoch} / {n_epochs}")
        self.train_step()
        if epoch % 10 == 0:
            self.act_step(
                self.dataset.input_tensor, n_games, mc_n_sim, N_bar
            )

        # periodic checkpoint of weights, optimizer state and games
        if epoch_number % 100 == 0:
            torch.save(
                {
                    "model_state_dict": self.model.state_dict(),
                    "optimizer_state_dict": self.optimizer.state_dict(),
                },
                self.checkpoint_dir / f"checkpoint_{epoch + 1}.pt",
            )
            self.dataset.save_game_data(self.checkpoint_data_dir)

        # exit strategy
        if self.dataset.games_are_good():
            break
    print("Training finished")
def optimizer_to(optim: torch.optim.Optimizer, device: str):
    """Moves every tensor held in the optimizer state to ``device``.

    Both tensors stored directly in ``optim.state`` and tensors stored in
    the per-parameter state dictionaries are moved, together with their
    gradients when present.

    Args:
        optim: The optimizer whose state must be moved.
        device: Name of the target torch device.
    """

    def _move(tensor: torch.Tensor) -> None:
        tensor.data = tensor.data.to(device)
        if tensor._grad is not None:
            tensor._grad.data = tensor._grad.data.to(device)

    for param in optim.state.values():
        # Not sure there are any global tensors in the state dict
        if isinstance(param, torch.Tensor):
            _move(param)
        elif isinstance(param, dict):
            for subparam in param.values():
                if isinstance(subparam, torch.Tensor):
                    _move(subparam)


class LoadCheckPointOp(Operation):
    """An operation which loads a checkpoint during training of an
    OpenAlphaTensor model.

    After ``execute`` the (possibly restored) model and optimizer, and the
    epoch encoded in the checkpoint file name, are available through the
    ``get_*`` accessors.
    """

    def __init__(self):
        super().__init__()
        self._last_epoch = None
        self._model = None
        self._optimizer = None

    @staticmethod
    def _checkpoint_epoch(checkpoint_path: Path):
        """Returns the epoch encoded in a ``<prefix>_<epoch>.pt`` file name,
        or ``None`` when the name does not end with an integer."""
        suffix = checkpoint_path.stem.split("_")[-1]
        return int(suffix) if suffix.isdecimal() else None

    def execute(
        self,
        model: AlphaTensorModel,
        optimizer: torch.optim.Optimizer,
        checkpoint_dir: str,
    ):
        """Load the most recent checkpoint found in a directory.

        The checkpoint with the highest epoch number in its file name is
        loaded into ``model`` and ``optimizer``. When the directory does not
        exist or holds no checkpoint, both are left untouched and the last
        epoch is set to 0.

        Args:
            model: The model to load the checkpoint into.
            optimizer: The optimizer to load the checkpoint into.
            checkpoint_dir: The directory to load the checkpoint from. A
                falsy value selects ``BASE_CHECKPOINT_DIR``.
        """
        checkpoint_dir = Path(checkpoint_dir or BASE_CHECKPOINT_DIR)

        # Collect (epoch, path) pairs in one pass. Files whose name does
        # not end with an epoch number (e.g. a final model saved in the
        # same directory) are ignored instead of raising ValueError.
        checkpoints = []
        if checkpoint_dir.is_dir():
            for path in checkpoint_dir.glob("*.pt"):
                epoch = self._checkpoint_epoch(path)
                if epoch is not None:
                    checkpoints.append((epoch, path))

        if checkpoints:
            last_epoch, checkpoint_path = sorted(
                checkpoints, key=lambda item: item[0]
            )[-1]
            print(f"Loading checkpoint from {checkpoint_path}")
            old_device = model.device
            # NOTE: torch.load unpickles the file; only load checkpoints
            # coming from a trusted source.
            # map_location="cpu" lets a checkpoint written on an
            # accelerator be read on any host; load_state_dict copies the
            # values into the tensors living on the current device.
            checkpoint = torch.load(checkpoint_path, map_location="cpu")
            model.load_state_dict(checkpoint["model_state_dict"])
            model.to(old_device)
            print(f"Loaded model to {old_device}")
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            optimizer_to(optimizer, old_device)
        else:
            last_epoch = 0

        self._last_epoch = last_epoch
        self._model = model
        self._optimizer = optimizer

    def get_last_epoch(self) -> int:
        """Returns the last epoch of the loaded checkpoint (0 if none)."""
        return self._last_epoch

    def get_model(self) -> AlphaTensorModel:
        """Returns the model loaded from the checkpoint."""
        return self._model

    def get_optimizer(self) -> torch.optim.Optimizer:
        """Returns the optimizer loaded from the checkpoint."""
        return self._optimizer

    def get_result(self) -> Any:
        pass
class LoadCheckpointDataOp(Operation):
    """An operation which loads the games played while training an
    OpenAlphaTensor model."""

    def __init__(self):
        super().__init__()
        self._loaded = False

    def execute(self, games_store_dir: Path, trainer: Trainer):
        """Load the games played while training an OpenAlphaTensor model.

        Games are loaded only when ``game_data.json`` exists inside the
        games directory; otherwise nothing happens.

        Args:
            games_store_dir: The directory where the games are stored. A
                falsy value selects ``BASE_CHECKPOINT_DATA_DIR``.
            trainer: The trainer to load the games into.
        """
        # Reset so that get_result() reflects this execution only.
        self._loaded = False
        # Normalise to Path: both the caller's value and the default may be
        # plain strings, on which `.exists()` and `/` are not available.
        games_store_dir = Path(games_store_dir or BASE_CHECKPOINT_DATA_DIR)
        # if games_store_dir contains games, load them
        if (games_store_dir / "game_data.json").exists():
            trainer.dataset.load_games(games_store_dir)
            self._loaded = True

    def get_result(self) -> bool:
        """Returns whether the games were loaded or not."""
        return self._loaded
class BuildOptimizerOp(Operation):
    """An operation which builds an optimizer for an OpenAlphaTensor
    model."""

    def __init__(self):
        super().__init__()
        self._optimizer = None

    def execute(
        self,
        optimizer_name: str,
        model: AlphaTensorModel,
        lr: float,
        weight_decay: float,
    ):
        """Builds the optimizer for the OpenAlphaTensor model.

        Args:
            optimizer_name (str): Name of the optimizer used. One of
                ``"adam"``, ``"adamw"`` or ``"sgd"``.
            model (AlphaTensorModel): OpenAlphaTensor model to be trained.
            lr (float): Learning rate.
            weight_decay (float): Weight decay used by the optimizer. It is
                only forwarded to AdamW.

        Raises:
            ValueError: If ``optimizer_name`` is not supported.
        """
        # NOTE(review): weight_decay is ignored for "adam" and "sgd";
        # confirm this is intended before forwarding it to them.
        if optimizer_name == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "adamw":
            optimizer = torch.optim.AdamW(
                model.parameters(), lr=lr, weight_decay=weight_decay
            )
        elif optimizer_name == "sgd":
            optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        else:
            raise ValueError(f"Optimizer {optimizer_name} not supported")
        self._optimizer = optimizer

    def get_optimizer(self) -> torch.optim.Optimizer:
        """Returns the built optimizer."""
        return self._optimizer

    def get_result(self) -> Any:
        pass


class SaveModelOp(Operation):
    """An operation which saves an OpenAlphaTensor model.

    The model parameters are stored in a json file, while the model weights
    are stored in a .pt file.
    """

    def get_result(self) -> Any:
        pass

    def execute(
        self,
        model: AlphaTensorModel,
        save_dir: str,
    ):
        """Saves the OpenAlphaTensor model.

        Writes ``final_model.pt`` (state dict) and ``model_params.json``
        (constructor parameters) into ``save_dir``, creating the directory
        when needed.

        Args:
            model (AlphaTensorModel): OpenAlphaTensor model to be saved.
            save_dir (str): Directory where the model will be saved. A
                falsy value selects the current directory.
        """
        save_dir = Path(save_dir if save_dir else ".")
        save_dir.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), save_dir / "final_model.pt")
        model_params = {
            "input_size": model.input_size,
            "tensor_length": model.tensor_length,
            # Use the model's own value when it exposes one, so that a
            # model built with a different scalars_size is described
            # correctly; fall back to the historical constant otherwise.
            "scalars_size": getattr(model, "scalars_size", 1),
            "emb_dim": model.emb_dim,
            "n_steps": model.n_steps,
            "n_logits": model.n_logits,
            "n_samples": model.n_samples,
        }
        # save parameters in a json file
        with open(
            save_dir / "model_params.json", "w", encoding="utf-8"
        ) as f:
            json.dump(model_params, f)
class TrainingOperation(Operation):
    """Operation which trains an AlphaTensor model to learn more efficient
    matrix multiplications.

    It builds a ``Trainer``, restores previously stored games when they are
    available and runs the training loop. The trained model is exposed
    through ``get_trained_model``.
    """

    def __init__(self):
        super().__init__()
        self._trained_model = None
        self._load_checkpoint_data_op = LoadCheckpointDataOp()

    def execute(
        self,
        model: AlphaTensorModel,
        input_size: int,
        n_steps: int,
        batch_size: int,
        optimizer: torch.optim.Optimizer,
        device: str,
        len_data: int,
        pct_synth: float,
        n_synth_data: int,
        limit_rank: int,
        max_epochs: int,
        n_actors: int,
        mc_n_sim: int,
        N_bar: int,
        last_epoch: int,
        lr: float,
        lr_decay_factor: float,
        lr_decay_steps: int,
        loss_params: Tuple[float, float] = None,
        random_seed: int = None,
        checkpoint_dir: str = None,
        checkpoint_data_dir: str = None,
        n_cob: int = 0,
        cob_prob: float = 0.0,
        data_augmentation: bool = False,
        extra_devices: List[str] = None,
    ):
        """Trains an AlphaTensor model to learn more efficient matrix
        multiplications.

        Args:
            model (AlphaTensorModel): The model to be trained.
            input_size (int): Flattened size of the matrices to be
                multiplied.
            n_steps (int): Number of steps used to get a single action out
                of a triplet.
            batch_size (int): Batch size.
            optimizer (torch.optim.Optimizer): The optimizer used for
                training.
            device (str): The name of the torch device used for training.
            len_data (int): Number of training samples used (both actor
                generated and synthetic).
            pct_synth (float): Initial percentage of synthetic samples used
                for training.
            n_synth_data (int): Number of synthetic training samples.
            limit_rank (int): Maximum rank for synthetically-generated
                matrices.
            max_epochs (int): Number of training epochs.
            n_actors (int): Number of actors playing a game at each acting
                step.
            mc_n_sim (int): Number of simulations during Monte Carlo tree
                search.
            N_bar (int): N_bar parameter used to compute tau when improving
                the policy.
            last_epoch (int): Latest epoch reached during training; training
                resumes from it.
            lr (float): Learning rate.
            lr_decay_factor (float): Learning rate's decay factor.
            lr_decay_steps (int): Number of learning rate's decay steps.
            loss_params (Tuple[float, float]): Alpha and Beta parameters
                used in the loss function.
            random_seed (int): Randomizing seed.
            checkpoint_dir (str): Directory used to store model checkpoints.
            checkpoint_data_dir (str): Directory used to store games as JSON
                files. Defaults to ``"games"`` when falsy.
            n_cob (int): Number of change of basis (cob) used for a single
                training sample.
            cob_prob (float): Probability of applying a change of basis.
            data_augmentation (bool): Whether to randomly swap the last
                operation of an episode with another operation.
            extra_devices (List[str]): Extra devices names used for
                multi-GPU training.
        """
        games_dir = Path(checkpoint_data_dir or "games")

        # 1) build the trainer
        trainer_settings = dict(
            model=model,
            tensor_size=input_size,
            n_steps=n_steps,
            batch_size=batch_size,
            optimizer=optimizer,
            device=device,
            len_data=len_data,
            pct_synth=pct_synth,
            n_synth_data=n_synth_data,
            limit_rank=limit_rank,
            loss_params=loss_params,
            random_seed=random_seed,
            checkpoint_dir=checkpoint_dir,
            checkpoint_data_dir=games_dir,
            data_augmentation=data_augmentation,
            cob_prob=cob_prob,
            n_cob=n_cob,
            extra_devices=extra_devices,
        )
        trainer = Trainer(**trainer_settings)

        # 2) restore the games stored by a previous run, if any
        self._load_checkpoint_data_op.execute(
            games_store_dir=games_dir,
            trainer=trainer,
        )

        # 3) run the training loop
        schedule = dict(
            n_epochs=max_epochs,
            n_games=n_actors,
            mc_n_sim=mc_n_sim,
            N_bar=N_bar,
            starting_epoch=last_epoch,
            initial_lr=lr,
            lr_decay_factor=lr_decay_factor,
            lr_decay_steps=lr_decay_steps,
        )
        trainer.train(**schedule)
        self._trained_model = trainer.model

    def get_trained_model(self):
        """Returns the trained model."""
        return self._trained_model

    def get_result(self) -> Any:
        pass
class TrainAlphaTensorRootOp(Operation):
    """Root operation which trains an AlphaTensor model to learn more
    efficient matrix multiplications.

    It chains the sub-operations created in ``__init__``: build the model,
    build the optimizer, load the latest checkpoint, train, and save the
    final model.
    """

    def __init__(self):
        super().__init__()
        self._model = None
        self._optimizer = None

        # one sub-operation per pipeline stage
        self._build_model_op = BuildModelOp()
        self._build_optimizer_op = BuildOptimizerOp()
        self._load_checkpoint_op = LoadCheckPointOp()
        self._training_op = TrainingOperation()
        self._save_model_op = SaveModelOp()

    def execute(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
        optimizer_name: str,
        lr: float,
        lr_decay_factor: float,
        lr_decay_steps: int,
        weight_decay: float,
        loss_params: Tuple[float, float],
        checkpoint_dir: str,
        checkpoint_data_dir: str,
        epochs: int,
        batch_size: int,
        len_data: int,
        n_synth_data: int,
        pct_synth: float,
        limit_rank: int,
        n_actors: int,
        mc_n_sim: int,
        N_bar: int,
        device: str,
        save_dir: str,
        random_seed: int,
        n_cob: int,
        cob_prob: float,
        data_augmentation: bool,
        extra_devices: List[str],
    ):
        """Trains an AlphaTensor model to learn more efficient matrix
        multiplications.

        Args:
            tensor_length (int): Number of step tensors fed to the model
                (history and current state).
            input_size (int): Flattened size of the matrices to be
                multiplied.
            scalars_size (int): Size of the scalar vectors fed to the torso
                model.
            emb_dim (int): Embedding dimension.
            n_steps (int): Number of steps used to get a single action out
                of a triplet.
            n_logits (int): Number of logits output by the policy head.
            n_samples (int): Number of samples used by the policy head at
                evaluation time.
            optimizer_name (str): Name of the optimizer used.
            lr (float): Learning rate.
            lr_decay_factor (float): Learning rate's decay factor.
            lr_decay_steps (int): Number of learning rate's decay steps.
            weight_decay (float): Weight decay used by the optimizer.
            loss_params (Tuple[float, float]): Alpha and Beta parameters
                used in the loss function.
            checkpoint_dir (str): Directory used to store model checkpoints.
            checkpoint_data_dir (str): Directory used to store games as JSON
                files.
            epochs (int): Number of training epochs.
            batch_size (int): Batch size.
            len_data (int): Number of training samples used (both actor
                generated and synthetic).
            n_synth_data (int): Number of synthetic training samples.
            pct_synth (float): Initial percentage of synthetic samples used
                for training.
            limit_rank (int): Maximum rank for synthetically-generated
                matrices.
            n_actors (int): Number of actors playing a game at each acting
                step.
            mc_n_sim (int): Number of simulations during Monte Carlo tree
                search.
            N_bar (int): N_bar parameter used to compute tau when improving
                the policy.
            device (str): The name of the torch device used for training.
            save_dir (str): Directory where the final trained model will be
                stored.
            random_seed (int): Randomizing seed.
            n_cob (int): Number of change of basis (cob) used for a single
                training sample.
            cob_prob (float): Probability of applying a change of basis.
            data_augmentation (bool): Whether to randomly swap the last
                operation of an episode with another operation.
            extra_devices (List[str]): Extra devices names used for
                multi-GPU training.
        """
        # NOTE(review): the nesting of the stages below was reconstructed
        # from a whitespace-collapsed copy of the file (flat sequence of
        # guarded stages, with the training call inside the checkpoint
        # guard because it needs `starting_epoch`). Confirm against the
        # repository.

        # Stage 1: build the model once and move it to the target device.
        if self._model is None:
            self._build_model_op.execute(
                tensor_length=tensor_length,
                input_size=input_size,
                scalars_size=scalars_size,
                emb_dim=emb_dim,
                n_steps=n_steps,
                n_logits=n_logits,
                n_samples=n_samples,
            )
            self._model = self._build_model_op.get_model().to(device)

        # Stage 2: build the optimizer on the model held by the build op.
        if self._build_model_op.get_model() is not None:
            self._build_optimizer_op.execute(
                optimizer_name=optimizer_name,
                model=self._build_model_op.get_model(),
                lr=lr,
                weight_decay=weight_decay,
            )
            self._optimizer = self._build_optimizer_op.get_optimizer()

        # Stage 3: restore the latest checkpoint, if any.
        if self._model is not None and self._optimizer is not None:
            self._load_checkpoint_op.execute(
                self._model, self._optimizer, checkpoint_dir
            )

        # Stage 4: train, resuming from the epoch reported by the
        # checkpoint operation.
        if self._load_checkpoint_op.get_model() is not None:
            self._model = self._load_checkpoint_op.get_model()
            self._optimizer = self._load_checkpoint_op.get_optimizer()
            starting_epoch = self._load_checkpoint_op.get_last_epoch()
            self._training_op.execute(
                model=self._model,
                input_size=input_size,
                n_steps=n_steps,
                batch_size=batch_size,
                optimizer=self._optimizer,
                device=device,
                len_data=len_data,
                pct_synth=pct_synth,
                n_synth_data=n_synth_data,
                limit_rank=limit_rank,
                max_epochs=epochs,
                n_actors=n_actors,
                mc_n_sim=mc_n_sim,
                N_bar=N_bar,
                last_epoch=starting_epoch,
                lr=lr,
                lr_decay_factor=lr_decay_factor,
                lr_decay_steps=lr_decay_steps,
                loss_params=loss_params,
                random_seed=random_seed,
                checkpoint_dir=checkpoint_dir,
                checkpoint_data_dir=checkpoint_data_dir,
                n_cob=n_cob,
                cob_prob=cob_prob,
                data_augmentation=data_augmentation,
                extra_devices=extra_devices,
            )

        # Stage 5: persist the trained model.
        if self._training_op.get_trained_model() is not None:
            self._model = self._training_op.get_trained_model()
            self._save_model_op.execute(
                model=self._model,
                save_dir=save_dir,
            )

    def get_result(self) -> AlphaTensorModel:
        """Returns the trained torch model"""
        return self._model
lr_decay_steps=lr_decay_steps, loss_params=loss_params, random_seed=random_seed, checkpoint_dir=checkpoint_dir, checkpoint_data_dir=checkpoint_data_dir, n_cob=n_cob, cob_prob=cob_prob, data_augmentation=data_augmentation, extra_devices=extra_devices, ) if self._training_op.get_trained_model() is not None: self._model = self._training_op.get_trained_model() self._save_model_op.execute( model=self._model, save_dir=save_dir, ) def get_result(self) -> AlphaTensorModel: """Returns the trained torch model""" return self._model ================================================ FILE: optimization/open_alpha_tensor/resources/open_alpha_tensor.md ================================================ # Open Source Implementation of DeepMind’s AlphaTensor Matrix multiplication is a fundamental operation used in many systems, from neural networks to scientific computing routines. Finding efficient and provably correct algorithms for matrix multiplication can have a huge impact on making computation faster and more efficient, but is a very challenging task. The space of possible algorithms is enormous, and traditional methods for discovering algorithms, such as human-designed heuristics or combinatorial search, are often suboptimal. [DeepMind](https://www.deepmind.com/)'s recently proposed an AI-based solution for automated search that goes far beyond human intuition. The solution consists of a deep reinforcement learning agent called AlphaTensor, built on top of [AlphaZero](https://www.deepmind.com/blog/alphazero-shedding-new-light-on-chess-shogi-and-go). This agent is trained to play a single-player game, TensorGame, where the goal is to discover computationally efficient algorithms for matrix multiplication. AlphaTensor is particularly good at handling large matrices by decomposing large matrix multiplications into smaller multiplications. 
Moreover, AlphaTensor can be used to achieve state-of-the-art performance for matrix multiplication once fine-tuned on a specific hardware device. AlphaTensor has great potential for accelerating deep learning computing. In deep learning, many time-consuming operations can be mapped to matrix multiplications. By using AlphaTensor to optimize these operations, the overall performance of deep learning models can be significantly improved. In this article, we will explore DeepMind's AlphaTensor architecture and algorithm and how it discovers new efficient algorithms by playing the TensorGame. Next, we will examine the [first open-source implementation of AlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor), and unresolved challenges to potentially revolutionize the computational performance of deep learning models with AlphaTensors. ![deepmind-4QVqSh4VvP4-unsplash](https://user-images.githubusercontent.com/83510798/221407730-77526b8f-b363-4716-9945-6ccd518632e5.jpg) Photo by [DeepMind](https://unsplash.com/@deepmind?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText) on [Unsplash](https://unsplash.com/photos/4QVqSh4VvP4) # What is DeepMind’s AlphaTensor? AlphaTensor is a reinforcement learning algorithm based on the AlphaZero algorithm and trained to play a simple one-player game: the TensorGame. This game consists in finding the tensor decomposition of a three-dimensional tensor representing the matrix multiplication. ### Matrix Multiplication Tensor For non-experts in Matrix Multiplication optimization, it may not be straightforward to understand how an operation, such as a matrix multiplication, can be mapped in a three-dimensional tensor. I will try to explain it in simple words and with examples. Let’s consider the product `C = A*B`, where for simplicity both A and B are square matrices of size N. The multiplication operation can be mapped in a 3D tensor of shape `(N^2, N^2, N^2)` . 
The first tensor dimension represents the flattened matrix A, the second dimension the flattened matrix B and the third dimension the flattened matrix C. The tensor has only binary values (either 1 or 0) for each entry. Note that the tensor represents the multiplication operation, so it is independent of the values of the matrices A and B. Every entry of the tensor corresponds to the coefficient of the operation. For example, to compute C[1,1], it is necessary to multiply A[1,1] by B[1,1]. Therefore, the tensor entry [0,0,0], which corresponds to A[1,1], B[1,1] and C[1,1], will have value 1. In contrast, to compute C[1,1], A[2,1] is not needed. Thus, the tensor row T[N, :, 0] (A[2,1] sits at zero-based position N of the flattened matrix A) will contain only zeros. The image below from [DeepMind’s paper](https://www.nature.com/articles/s41586-022-05172-4) shows an example of a tensor for N=2. Screen Shot 2023-02-26 at 12 33 26 PM As shown in (b) and (c) in the figure above, it is possible to implement an algorithm for computing the product using a decomposition of the 3D tensor. More specifically, the algorithm below can be used for converting a tensor decomposition (the matrices U, V, W) into a matrix multiplication algorithm. Screen Shot 2023-02-26 at 1 36 10 PM ## The TensorGame The problem of finding efficient algorithms for matrix multiplication is extremely challenging because the number of possible algorithms to consider is much larger than the number of atoms in the universe, even for small instances of matrix multiplication. DeepMind converted this problem into a single-player game, and called it the TensorGame. In this game, the player chooses how to combine different entries of matrices to multiply them. A score is assigned based on the number of operations required to achieve the correct multiplication result.
The game ends when the zero tensor is reached or when the maximum number of moves has been made. The final factorization is evaluated based on an estimation of the residual rank and certain optimization criteria, such as asymptotic time complexity or practical runtime. The initial position in the TensorGame corresponds to the Matrix Multiplication Tensor expressed on some random basis. In each step $t$ of the game, the player writes down three vectors $\vec{u}(t), \vec{v}(t), \vec{w}(t)$, which specify the rank-1 tensor $\vec{u} \otimes \vec{v} \otimes \vec{w}$. The state of the game is updated by subtracting the rank-1 tensor built from the vectors selected by the player: $$ \tilde{S}_{t+1} = \tilde{S}_{t} - \vec{u} \otimes \vec{v} \otimes \vec{w} $$ where $\tilde{S}_0$ is the Matrix Multiplication Tensor. If the game ends in p steps, this means that the Matrix Multiplication Tensor $\tilde S_0$ can be decomposed into p rank-1 tensors $\vec{u} \otimes \vec{v} \otimes \vec{w}$, i.e. its rank is at most p. The TensorGame can then be interpreted as a rank decomposition algorithm and AlphaTensor can be seen as an algorithm for estimating the rank of the tensor. ## AlphaTensor Architecture So far we have learned about the TensorGame and clarified how its solution can be seen as a matrix multiplication algorithm. Let’s now explore the main concepts of AlphaTensor, the algorithm used for the game. AlphaTensor architecture is basically an encoder-decoder Transformer architecture where: - the encoder takes as input the game state $\tilde S_t$, the n previous actions taken by the model (usually n=7) and the time index $t$ of the current action. Information is stacked together in a tensor with shape `(n+1, N^2, N^2, N^2)`. This tensor is then reshaped and transformed (using three linear layers) into a tensor of shape `(N^2, N^2, c)` where c is the inner dimension of the model. - the decoder generates the `n_steps` actions from the embedded vector given by the encoder in an auto-regressive way.
Each action corresponds to a token of the triplets $(\vec{u}, \vec{v}, \vec{w})$ representing one of the triplets decomposing the game tensor (i.e. reducing its rank) The model is trained by alternating back-propagation and model acting. Model acting is used to generate data that is then used to train the model. In practice, the model is trained with a mixture of synthetically generated data and data generated by the model during acting. The acting step is done by taking a 3D tensor corresponding to a matrix operation and playing `n_actors` games on it. Each actor plays a game either on the standard basis or on an alternative basis (the change of basis is applied with a given probability). The results are then collected and can be used in the training step with the synthetic data. The acting step is based on AlphaZero's Monte Carlo Tree Search (MCTS), modified to support large action spaces. In short, before choosing the action, `n_sims` paths are explored from the model output with a maximum future exploration of 5 steps. The probabilities generated by the model are then adjusted taking into account the generated paths. Then the action with the most promising future path(s) is chosen to continue the game. While training the model, the reward is actually a negative reward (penalty). Its absolute value increases with each additional step required to solve the game. If the model takes `m` steps to solve a TensorGame, the reward associated with the game is `r=-m.` If the model is not able to solve the TensorGame in `max_rank` steps, the reward is computed by estimating the rank of the remaining tensor. The rank is estimated as the sum of the ranks of the matrices that compose the tensor. The estimate is an upper bound on the true rank of the tensor. When fine-tuning the model, the penalty reward at the terminal state should also take into account the latency of the algorithm produced by the model. 
The reward formula becomes `rt'=rt+λbt`, where `rt` is the reward scheme described earlier, `bt` is the benchmark reward (non-zero only at the terminal state), and *`λ`* is a user-specified coefficient. Screen Shot 2023-02-26 at 1 37 12 PM The image above from DeepMind's paper shows the speed-ups (%) of AlphaTensor-discovered algorithms tailored for a GPU and a TPU, extracted from DeepMind’s paper. Speed-ups are measured relative to standard (e.g. cuBLAS for the GPU) matrix multiplication on the same hardware and compared to the Strassen-square algorithm. # The Open Source Implementation of DeepMind’s AlphaTensor [OpenAlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor) is the first open source implementation of AlphaTensor and was developed by [Diego Fiori](https://www.linkedin.com/in/diego-fiori-/). Let's discover more about the implementation. As we discussed earlier, the AlphaTensor architecture is fairly straightforward, based on a standard transformer with an encoder-decoder architecture. The most interesting components of AlphaTensor are the first layer in the encoder part and the way the actions are sampled. Let’s start with the first encoding layer. ```python # x.size = (N, T, S, S, S) # scalars.size = (N, s) batch_size = x.shape[0] S = x.shape[-1] T = x.shape[1] x1 = x.permute(0, 2, 3, 4, 1).reshape(batch_size, S, S, S * T) x2 = x.permute(0, 4, 2, 3, 1).reshape(batch_size, S, S, S * T) x3 = x.permute(0, 3, 4, 2, 1).reshape(batch_size, S, S, S * T) input_list = [x1, x2, x3] for i in range(3): temp = self.linears_1[i](scalars).reshape(batch_size, S, S, 1) input_list[i] = torch.cat([input_list[i], temp], dim=-1) input_list[i] = self.linears_2[i](input_list[i]) x1, x2, x3 = input_list ``` In the snippet above, we show how the input tensor is decomposed into three tensors, which are then used as query, key and value inputs of the transformer-layer. 1. 
Across the three tensor dimensions representing the flattened matrices (A, B, C), the input tensor is flattened along each dimension together with the dimension representing the previous actions. In this way, in each flattened-copy of the input tensor, the selected dimension is an aggregation of the last T-1 values and the actual value, for all the S values of the selected dimension, where S=N^2. Philosophically, it is as if, for each dimension, we focus on what happened in the previous actions in that dimension. 2. The scalars are mapped in three different spaces of dimension S^2, and then reshaped to be concatenated with the tensors obtained at the previous point. Conceptually, the scalars are mapped to an embedding space of dimension S^2, and then the embedded information is chunked into S vectors and stacked together, similar to what happens to text when tokenized. 3. Scalar tokens are concatenated with the restructured input tensor and then given as input to a linear layer for mapping the scalars+channel-history focus information in the internal dimension of the model. These three steps can be interpreted as a way of giving to the model both information about the scalars (as in the TensorGame time step) and the focus on the previous actions for each channel. Regarding the way the actions are produced, it is interesting to note that AlphaTensor generates as output the triplet u, v, w, which aims to reduce the tensor rank. The three vectors have size S and since they are concatenated the model has to produce a vector of size 3*S. AlphaTensor is trained with a RL algorithm, so all possible actions must be expressed in terms of probabilities in an enumerated space, i.e. the model produces a probability over the different actions. This means that each vector in the 3S space should be mapped to a different action. This results in an action space of size |F|^(3S), where |F| is the number of different values that the element of u, v, w can take. 
Usually the values are restricted to (-2, -1, 0, 1, 2), resulting in a cardinality of 5 elements. Here comes a major challenge: to generate the action probabilities for a matrix product of matrices of size 5 we would need a memory of 5^75 * 4 bytes, which would mean `~10^44 GB` of memory. Clearly we cannot manage such a large action space. How do we solve the problem? To reduce the memory footprint of the action probabilities we can split the triplets into smaller chunks, “tokenize” them, and treat the chunks as generated tokens in the transformer architecture, i.e. the tokens are given as input to the decoder in an auto-regressive way. In the example above we can split the triplets into 15 chunks, reducing the memory consumption to `15 * 5^(75/15) * 4` bytes, i.e. `187.5 KB`. ```python def _eval_forward(self, e: torch.Tensor): bs = e.shape[0] future_g = ( torch.zeros((bs, self.n_samples, self.n_steps)).long().to(e.device) ) ps = torch.ones((bs, self.n_samples)).to(e.device) e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1) future_g = future_g.view(-1, self.n_steps) ps = ps.view(-1) e = e.view(-1, e.shape[-2], e.shape[-1]) for i in range(self.n_steps): o_s, z_s = self.core(future_g[:, : i + 1], e) future_g[:, i], p_i = sample_from_logits(o_s[:, i]) ps *= p_i future_g = future_g.view(bs, self.n_samples, self.n_steps) ps = ps.view(bs, self.n_samples) return ( future_g, ps, z_s[:, 0].view(bs, self.n_samples, *z_s.shape[2:]).mean(1), ) ``` Above we show the code snippet for generating the full action. In the code, `self.core` contains the decoder layer and the tensor `e` represents the output of the encoder layer. Zero can be considered as a special placeholder token (akin to special tokens such as `<eos>` in NLP models) and the `n_steps` actions representing the `n_steps` chunks are generated in a progressive way. The model returns three quantities: 1. The generated actions 2. The probability associated with the full action 3. 
The logits produced for generating the first action (the first chunk) that will be used for computing the model value. It is worth spending a few words on the `n_samples` parameter. The parameter is used for the acting step and it allows the model to generate different versions of the triplets which will then be used for exploring the action space in the Monte Carlo Tree Search algorithm used in the Acting process. The `n_samples` different actions are sampled according to the policy generated by the model. ## Acting Step The trickiest part of the whole algorithm is probably the Acting step used for solving the TensorGame. The algorithm is not deeply explained in the AlphaTensor paper, since it is based on several of DeepMind’s previous papers, which are just cited and assumed to be known. Here, I’ll re-compose all the missing pieces and explain step by step our implementation. We can organize the acting steps in three different components: - The Monte-Carlo Tree Search - The game simulation - The Improved policy computation ### Monte-Carlo Tree Search (MCTS) Monte Carlo Tree Search (MCTS) is a widely used artificial intelligence technique for game playing, particularly in board games and video games. The algorithm creates a game tree that simulates potential moves and outcomes and uses random sampling to evaluate the expected reward for each move. The algorithm then repeatedly selects the move with the highest expected reward and continues simulating outcomes until it reaches a terminal state or a specified stopping condition. The simulations are used to estimate the probability of winning for each move and guide the decision-making process. MCTS has been shown to be effective in complex games where the number of possible moves and outcomes is large, and it has been used in successful game-playing AI systems, such as AlphaGo. In AlphaTensor a modified version of the original MCTS is used. 
In particular, instead of randomly selecting the action from the whole action space, the action is selected among a subset generated directly by the model (through the `n_samples` presented before). The correction to the policy update is then applied in the **Improved Policy computation** step. In our implementation, we decided to keep all the information about the Monte-Carlo tree in a dictionary having as key the hash-version of the TensorGame state and as values the information associated with the state itself. Each Monte-Carlo step starts from a node and simulates `n_sim` mini-games, exploring the future with a horizon of 5 moves. If the node has already been explored in previous simulations, `n_sim` is adjusted considering the number of previous explorations. For each node the number of visits is stored in the `N_s_a` tensor, since this tensor contains the number of visits per node child action (among the ones sampled by the model). ```python def monte_carlo_tree_search( model: torch.nn.Module, state: torch.Tensor, n_sim: int, t_time: int, n_steps: int, game_tree: Dict, state_dict: Dict, ): """Runs the monte carlo tree search algorithm. Args: model (torch.nn.Module): The model to use for the simulation. state (torch.Tensor): The initial state. n_sim (int): The number of simulations to run. t_time (int): The current time step. n_steps (int): The maximum number of steps to simulate. game_tree (Dict): The game tree. state_dict (Dict): The dictionary containing the states. 
""" state_hash = to_hash(extract_present_state(state)) if state_hash in state_dict: with torch.no_grad(): N_s_a = state_dict[state_hash][3] n_sim -= int(N_s_a.sum()) n_sim = max(n_sim, 0) for _ in range(n_sim): simulate_game(model, state, t_time, n_steps, game_tree, state_dict) # return next state possible_states_dict, _, repetitions, N_s_a, q_values, _ = state_dict[ state_hash ] possible_states = _recompose_possible_states(possible_states_dict) next_state_idx = select_future_state( possible_states, q_values, N_s_a, repetitions, return_idx=True ) next_state = possible_states[next_state_idx] return next_state ``` The code above shows our implementation of the algorithm. For a matter of code simplicity the policy correction is performed in the `simulate_game` function. ### Game Simulation The `simulate_game` function is responsible for exploring the tree composed of nodes representing a particular state of the TensorGame. It also runs the model whenever a leaf node is encountered and it stores all node information in the `state_dict` dictionary. Let’s give a deep look at its implementation: ```python @torch.no_grad() def simulate_game( model, state: torch.Tensor, t_time: int, max_steps: int, game_tree: Dict, states_dict: Dict, horizon: int = 5, ): """Simulates a game from a given state. Args: model: The model to use for the simulation. state (torch.Tensor): The initial state. t_time (int): The current time step. max_steps (int): The maximum number of steps to simulate. game_tree (Dict): The game tree. states_dict (Dict): The states dictionary. horizon (int): The horizon to use for the simulation. 
""" idx = t_time max_steps = min(max_steps, t_time + horizon) state_hash = to_hash(extract_present_state(state)) trajectory = [] # selection while state_hash in game_tree: ( possible_states_dict, old_idx_to_new_idx, repetition_map, N_s_a, q_values, actions, ) = states_dict[state_hash] possible_states = _recompose_possible_states(possible_states_dict) state_idx = select_future_state( possible_states, q_values, N_s_a, repetition_map, return_idx=True ) trajectory.append((state_hash, state_idx)) # state_hash, action_idx future_state = extract_present_state(possible_states[state_idx]) state = possible_states[state_idx] state_hash = to_hash(future_state) idx += 1 # expansion if idx <= max_steps: trajectory.append((state_hash, None)) if not game_is_finished(extract_present_state(state)): state = state.to(model.device) scalars = get_scalars(state, idx).to(state.device) actions, probs, q_values = model(state, scalars) ( possible_states, cloned_idx_to_idx, repetitions, not_dupl_indexes, ) = extract_children_states_from_actions( state, actions, ) not_dupl_actions = actions[:, not_dupl_indexes].to("cpu") not_dupl_q_values = torch.zeros(not_dupl_actions.shape[:-1]).to( "cpu" ) N_s_a = torch.zeros_like(not_dupl_q_values).to("cpu") present_state = extract_present_state(state) states_dict[to_hash(present_state)] = ( _reduce_memory_consumption_before_storing(possible_states), cloned_idx_to_idx, repetitions, N_s_a, not_dupl_q_values, not_dupl_actions, ) game_tree[to_hash(present_state)] = [ to_hash(extract_present_state(fut_state)) for fut_state in possible_states ] leaf_q_value = q_values else: leaf_q_value = -int(torch.linalg.matrix_rank(state).sum()) # backup backward_pass(trajectory, states_dict, leaf_q_value=leaf_q_value) ``` Each simulation is divided in three parts: - Selection - Expansion - Backup In the `selection` part the simulation is run on the already generated tree-nodes, and the following node is selected using the following function: ```python def 
select_future_state( possible_states: List[torch.Tensor], q_values: torch.Tensor, N_s_a: torch.Tensor, repetitions: Dict[int, list], c_1: float = 1.25, c_2: float = 19652, return_idx: bool = False, ) -> torch.Tensor: """Select the future state maximizing the upper confidence bound.""" # q_values (1, K, 1) pi = torch.tensor( [ len(repetitions[i]) for i in range(len(possible_states)) if i in repetitions ] ).to(q_values.device) ucb = q_values.reshape(-1) + pi * torch.sqrt( torch.sum(N_s_a) / (1 + N_s_a) ) * (c_1 + torch.log((torch.sum(N_s_a) + c_2 + 1) / c_2)) if return_idx: return ucb.argmax() return possible_states[ucb.argmax()] ``` In practice, the action maximizing the `ucb` function $$ Q(a,s) + \pi(a,s) * \sqrt{\frac{\sum_i{N(s, a_i)}}{1+N(s,a)}} * \left[c_1 + \log\left(\frac{1+c_2+\sum_i{N(s, a_i)}}{c_2}\right)\right] $$ for the given state is selected. Where Q represents the Q values generated by the model and π represents the random distribution over the actions sampled using the model policy. `N(s, a)` represents the number of visits of the node to action a from node s. Once the selection phase reaches a leaf node, if the simulation has not reached a terminal condition (in terms of either maximum exploration, i.e. future horizon, or game ending), the model is then used for selecting `n_samples` alternative nodes (they will be leaf nodes in the successive iteration). This is called the `expansion` phase, since new nodes are added to the tree. Then, no further node is explored in the current simulation, but the leaf q_value is sent to the following simulation step: the `backup`. Backup is the final stage of each simulation. During backup, if the leaf node was a terminal state the final reward is computed else the leaf q value is used as an estimated reward. Then the reward is back-propagated on the simulation trajectory updating both the states q_values and updating the visit counter `N(s, a)`. 
In the snippet below we show the code for the reward back-propagation. ```python def backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor): """Backward pass of the montecarlo algorithm""" reward = 0 for idx, (state, action_idx) in enumerate(reversed(trajectory)): if action_idx is None: # leaf node reward += leaf_q_value else: ( _, old_idx_to_new_idx, _, N_s_a, q_values, _, ) = states_dict[state] if isinstance(reward, torch.Tensor): reward = reward.to(q_values.device) action_idx = int(action_idx) if action_idx in old_idx_to_new_idx: not_dupl_index = old_idx_to_new_idx[int(action_idx)] else: not_dupl_index = action_idx reward -= 1 q_values[:, not_dupl_index] = ( N_s_a[:, not_dupl_index] * q_values[:, not_dupl_index] + reward ) / (N_s_a[:, not_dupl_index] + 1) N_s_a[:, not_dupl_index] += 1 ``` ### Improved Policy Computation Once all the simulations have been run and the MCTS offers an interesting snapshot of the near future it is time to update the policy associated with the predicted nodes and return them, so that they can be used during training. The improved policy, following the method described in [Hubert et al](https://arxiv.org/pdf/2104.06303.pdf), is used for managing large action spaces. In fact, for small search spaces it is possible during MCTS to sample an action randomly from the action space and evaluate its impact. A similar approach in a much larger action space would lead all trajectories to diverge along different paths, and it would need an infinite number of trajectories to get meaningful statistics and then update the policy. Since here we are using sample-MCTS to avoid the dispersion, i.e. `n_samples` actions are sampled according to the model policy and then MCTS just selects one of the sampled actions while exploring the tree, we need to take into account the sample-correction when computing the final updated policy that will be used while training the model. 
In practice the improved policy is computed as $$ I\pi\left(s, a\right) = \frac{N^{1/\tau(s)}(s, a)}{\sum_iN^{1/\tau(s)}(s, a_i)} $$ where $\tau(s) = \frac{\log\left(\sum_iN(s, a_i)\right)}{\log\left(\bar{N}\right)}$ if $\sum_iN(s, a_i) > \bar{N}$ else $\tau(s) = 1$. ```python def compute_improved_policy( state_dict: Dict, states: List[str], model_n_steps: int, model_n_logits: int, N_bar: int, ): """Compute the improved policy given the state_dict, the list of states. The improved policy is computed as N_s_a^(1/tau) / (N_s_a^(1/tau)).sum() where tau is (log(N_s_a.sum()) / log(N_bar)) if N_s_a.sum() > N_bar else 1. """ policies = torch.zeros(len(states), model_n_steps, model_n_logits) N_bar = torch.tensor(N_bar) for idx, state in enumerate(states): N_s_a = state_dict[state][3] actions = state_dict[state][5] if N_s_a.sum() > N_bar: tau = (torch.log(N_s_a.sum()) / torch.log(N_bar)).item() else: tau = 1 N_s_a = N_s_a ** (1 / tau) improved_policy = N_s_a / N_s_a.sum() for sample_id in range(actions.shape[1]): action_ids = actions[0, sample_id] for step_id, action_id in enumerate(action_ids): policies[idx, step_id, action_id] += improved_policy[ 0, sample_id ] return policies ``` Note that in our implementation after having computed the policy from the `N_s_a` tensor we have to map it back to the original action tensor. In fact `N_s_a` just considers the actions sampled by the model, while the final policy must contain probabilities also for the not-explored actions. ### Differences with respect to the ChatGPT training algorithm AlphaTensor is the latest member of the AlphaGo/AlphaZero family of artificial intelligence methods by DeepMind. These methods are based on the Monte Carlo Tree Search (MCTS) algorithm, which has been refined and enhanced by DeepMind to tackle increasingly complex tasks. 
Another AI system, OpenAI's ChatGPT, which has caused a lot of buzz for its remarkable performance, was trained with a different approach, called Reinforcement Learning with Human Feedback (RLHF). RLHF is a fine-tuning technique used to tune language models to follow a set of written instructions. It uses human preferences as a reward signal to fine-tune the model, thereby aligning the behavior of the language model with the stated preferences of a specific group of people, rather than some broader notion of ‘human values’. In contrast, MCTS is a tree-based search algorithm used to determine the optimal moves in games. It simulates potential moves and updates the values of each move based on their outcomes, guiding the selection of the best move. RLHF collects data from human-written demonstrations and human-labelled comparisons between AI models, and trains a reward model to predict the preferences of a given group of people. The reward model is then used to fine-tune the AI models. MCTS, on the other hand, uses simulations and evaluations to determine the best decision. Although they are different approaches, RLHF and MCTS also have similarities. Both artificial intelligence techniques use decision-making and problem-solving methods, and both use a trial-and-error approach to explore different options and make decisions based on available information. Both are also iterative processes that improve over time as more information and experience are gathered. The choice between RLHF and MCTS depends on the task at hand. RLHF is ideal when there is no clear metric for evaluating the model performance, while MCTS has proven effective in game-like tasks where knowledge and exploration of the future give the model a significant advantage. ## Code Optimization for AlphaTensor training Implementing the AlphaTensor training algorithm requires finding the perfect compromise between training speed and memory consumption. 
As seen in the Model section, simply considering the action tokenization can save a lot of memory, but an overly aggressive action space reduction can lead to both a drop in accuracy and slower performance. The latter happens because all tokens are generated sequentially in an autoregressive way by the model decoder. Therefore, the inference time grows linearly with the number of tokens per action once the softmax on the action space is not the bottleneck anymore. When setting up AlphaTensor training, the main difficulties were found in dealing with the acting process. If the tensors are not stored in the correct format, the MCTS can easily cause uncontrolled memory usage growth. On the other hand, if the number of tensors stored during each simulation is reduced too much, the MCTS can spend an infinite amount of time re-computing the required states. Let's take an example of the game simulation step, where the game is explored by looking at possible future scenarios. For each state, if we don't save the actions generated by the model and we decide to save only the random seed used to sample the actions from the policy, then each time we explore a tree node we would have to recompute the policy and then sample the actions. Clearly, we decided to store the sampled actions to save time and to avoid having to manage model sharing between different processes in the case of MCTS exploration parallelization. However, just saving the actions was not enough to get a sufficiently efficient acting step. In fact, the time for converting the n_steps actions into the (u, v, w) triplet, reducing the game tensor state and creating the new 3D tensors from the n_samples actions would easily be a bottleneck for the whole training. Secondly, we didn't want to store all possible future states for each sampled action, as this would have a huge impact on the memory used by the algorithm. 
Suppose we set n_samples=32, n=7 and N=5, and let's remember that N is the size of the square matrix product we want to reduce and n is the number of previous actions remembered by the model. In this situation, each state tensor would have the form (8, 25, 25, 25), which multiplied by 32 would result in 32*8*25*25*25*4 bytes for each node in the graph. Now, considering that each simulation in the expansion phase generates a new node (and n_sim=200), we would have a final memory consumption of 200*32*8*25*25*25*4 = 3.2GB for the first MCTS node alone. In the worst case scenario, while exploring `max_rank` nodes during acting (where `max_rank=150`), this would result in a total memory consumption of 150 * 3.2GB = 480GB in RAM memory (or GPU memory if all tensors were stored on the GPU). We ran the training on our workstation with 128 GB of RAM and 48 GB of GPU memory, so we had to reduce the memory consumption. Since we didn't want to increase the execution time, we adopted an optimization that exploits the redundancy in the state tensors produced. In fact, the tensors have n-1 previous actions in common, which can then be stored once and not repeated for each stored tensor. This reduces the memory footprint to roughly 2/7 (~28%) of the original, meaning that in the worst case about 137GB (480GB * 2/7) need to be stored. At this point, by simply pruning the unused part of the tree (such as the unselected trajectories) and storing the tensors in CPU memory, we were able to avoid any memory error during training. # Next Steps With AlphaTensor now being open source, several exciting avenues for further development open up. A natural next step is to fine-tune AlphaTensor on specific hardware devices and benchmark performance. At the time of writing, fine-tuning was in progress. Another important advance would be the support for remote compilation, allowing users to build algorithms optimized for edge devices. 
This can be achieved by storing the AlphaTensor model on a server, while the matrix multiplication algorithm is evaluated on different hardware. It could also be important to extend support for different compilers to compute the latency-based reward correction. Different compilers can lead to different optimized algorithms on a given hardware. For example, the DeepMind paper showed promising results using JAX and the XLA compiler on TPU and Nvidia GPUs. It would be interesting to evaluate this using NCCL on Nvidia or llvm on CPUs. Finally, extending the model and training algorithm to support larger matrix sizes remains a major open challenge. Currently, AlphaTensor supports a maximum matrix size of 5, but it can be applied by splitting larger matrix multiplications into groups of tiny MMs with a size smaller than 5. This approach is suboptimal, and performing the reduction directly on the large tensor corresponding to the full MM could theoretically lead to better results. ## Speedster integration of AlphaTensor AlphaTensor opens the doors for further improvements to Speedster. [Speedster](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster) is an open source module designed to speed up AI inference with just a few lines of code. The library automatically applies the best set of SOTA optimization techniques to achieve maximum inference speed-up. Within Speedster, AlphaTensor will use its optimized kernels for matrix multiplication to find the optimal set of sub-operations for each layer in the AI model that involve matrix multiplication, including linear layers, attention layers, and convolution layers. The matrix multiplications will be decomposed into sub-matrix multiplications up to the maximum size supported by AlphaTensor, and the fastest decomposition will be selected for each layer. This optimization process will be applied to all layers in the neural network, resulting in a dramatically improved model. 
We expect to see significant speed-ups especially in transformer models, where large matrix multiplications become the computational bottleneck at larger sizes. We also plan to support AlphaTensor algorithm generation for reduced precision formats, such as fp16 and int8, in addition to fp32. ================================================ FILE: optimization/open_alpha_tensor/setup.py ================================================ from pathlib import Path from setuptools import setup, find_packages REQUIREMENTS = [ "nebullvm", "torch", "tqdm", ] this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text(encoding="utf8") setup( name="OpenAlphaTensor", version="0.0.1", packages=find_packages(), install_requires=REQUIREMENTS, long_description=long_description, include_package_data=True, long_description_content_type="text/markdown", ) ================================================ FILE: optimization/optimate/README.md ================================================ # 🧉 OptiMate (WIP) Interactive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup. If you like this library, give us a star to show your support for the project ⭐ ## 📖 Description The OptiMate module is targeted at a sophisticated and savvy type of users, who need to squeeze out every last drop of performance out of a given hardware. The module is designed to help users to optimize their deep-learning models through the use of profilers and advanced optimization techniques. It also includes a smart assistant that guides the user through the optimization process and provides suggestions to improve the performance of the model. Each temporary optimization is tracked in a detailed version history, allowing the user to revert to its preferred version at the end of the optimization process. 
First, the module leverages profilers to gather information about the model, such as the amount of time it takes for the model to make predictions and the amount of memory used. This information helps in identifying bottlenecks and other inefficiencies in the model. Then, the module uses various optimization techniques to improve inference performances. These techniques include, among others, model compression, pruning, and quantization, which can help reduce the size and computational demand of the model. Throughout the process, the smart assistant provides guidance and suggestions to the user. For example, it might suggest which optimization techniques to try out or provide guidance on how to adjust the model parameters to improve its performance. Overall, the module provides a user-friendly but sophisticated interface to get the most out of any model / hardware setup. Try it out today, and reach out if you have any feedback! ================================================ FILE: optimization/speedster/README.md ================================================ # 💥 Speedster `Speedster` reduces inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs). The idea is to make AI inference way cheaper in just a few lines of code. `Speedster` makes it easy to combine optimization techniques across the whole software-to-hardware stack, delivering best-in-class cost savings. If you like the idea, give us a star to support the project ⭐ ![speedster](https://user-images.githubusercontent.com/53374883/225599469-f1a626f0-c001-42bd-bc8b-ec0e966ddad6.png) The core `Speedster` workflow consists of 3 steps: - [x] **Select**: input your model in your preferred DL framework and express your preferences regarding: - Accuracy loss: do you want to trade off a little accuracy for significant cost savings? - Optimization time: achieving great savings can be time-consuming. 
Can you wait, or do you need an instant answer? - [x] **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) that is compatible with your needs and local hardware. - [x] **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just cheaper 🚀). # Installation Install `Speedster` and its base requirements: ``` pip install speedster ``` Then make sure to install all the available deep learning compilers. ``` python -m nebullvm.installers.auto_installer --compilers all ``` > :warning: For **MacOS** with **ARM processors**, please use a conda environment. > Moreover, if you want to optimize a **PyTorch model**, PyTorch must be pre-installed > on your environment before proceeding to the next step, please install it from this > [link](https://pytorch.org/get-started/locally/). For more details on how to install Speedster, please visit our [Installation](https://docs.nebuly.com/Speedster/installation/) guide. # Quick start Only one line of code - that’s what you need to accelerate your model! Find below your getting started guide for 5 different input model frameworks:
🔥 PyTorch In this section, we will learn about the 4 main steps needed to optimize PyTorch models: 1) Input your model and data 2) Run the optimization 3) Save your optimized model 4) Load and run your optimized model in production ```python import torch import torchvision.models as models from speedster import optimize_model, save_model #1 Provide input model and data (we support PyTorch Dataloaders and custom input, see the docs to learn more) model = models.resnet50() input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)] #2 Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) #3 Save the optimized model save_model(optimized_model, "model_save_path") ``` Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice. ```python #4 Load and run your PyTorch accelerated model in production from speedster import load_model optimized_model = load_model("model_save_path") output = optimized_model(input_sample) ``` For more details, please visit [Getting Started with PyTorch Optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/).
🤗 Hugging Face Transformers In this section, we will learn about the 4 main steps needed to optimize 🤗 Hugging Face Transformer models: 1) Input your model and data 2) Run the optimization 3) Save your optimized model 4) Load and run your optimized model in production *
✅ For Decoder-only or Encoder-only architectures (Bert, GPT, etc) ```python from transformers import AlbertModel, AlbertTokenizer from speedster import optimize_model, save_model #1a. Provide input model: Load Albert as an example model = AlbertModel.from_pretrained("albert-base-v1") tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1") #1b. Dictionary input format (also string format is accepted, see the docs to learn more) text = "This is an example text for the huggingface model." input_dict = tokenizer(text, return_tensors="pt") input_data = [input_dict for _ in range(100)] #2 Run Speedster optimization (if input data is in string format, also the tokenizer # should be given as input argument, see the docs to learn more) optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) #3 Save the optimized model save_model(optimized_model, "model_save_path") ``` Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice. ```python #4 Load and run your Huggingface accelerated model in production from speedster import load_model optimized_model = load_model("model_save_path") output = optimized_model(**input_sample) ``` For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).
*
✅ For Encoder-Decoder architectures (T5 etc) ```python from transformers import T5Tokenizer, T5ForConditionalGeneration from speedster import optimize_model, save_model #1a. Provide input model: Load T5 as an example model = T5ForConditionalGeneration.from_pretrained("t5-small") tokenizer = T5Tokenizer.from_pretrained("t5-small") #1b. Dictionary input format question = "What's the meaning of life?" answer = "The answer is:" input_dict = tokenizer(question, return_tensors="pt") input_dict["decoder_input_ids"] = tokenizer(answer, return_tensors="pt").input_ids input_data = [input_dict for _ in range(100)] #2 Run Speedster optimization (if input data is in string format, also the tokenizer # should be given as input argument, see the docs to learn more) optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) #3 Save the optimized model save_model(optimized_model, "model_save_path") ``` Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice. ```python #4 Load and run your Huggingface accelerated model in production from speedster import load_model optimized_model = load_model("model_save_path") output = optimized_model(**input_sample) ``` For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).
🧨 Hugging Face Diffusers > :warning: In order to work properly, the diffusers optimization requires `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. For additional details, please look at the docs [here](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/). In this section, we will learn about the 4 main steps needed to optimize Stable Diffusion models from the Diffusers library: 1) Input your model and data 2) Run the optimization 3) Save your optimized model 4) Load and run your optimized model in production ```python import torch from diffusers import StableDiffusionPipeline from speedster import optimize_model, save_model #1 Provide input model and data model_id = "CompVis/stable-diffusion-v1-4" device = "cuda" if torch.cuda.is_available() else "cpu" if device == "cuda": # On GPU we load by default the model in half precision, because it's faster and lighter. pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16) else: pipe = StableDiffusionPipeline.from_pretrained(model_id) # Create some example input data input_data = [ "a photo of an astronaut riding a horse on mars", "a monkey eating a banana in a forest", "white car on a road surrounded by palm trees", "a fridge full of bottles of beer", "madara uchiha throwing asteroids against people" ] #2 Run Speedster optimization optimized_model = optimize_model( model=pipe, input_data=input_data, optimization_time="unconstrained", ignore_compilers=["torch_tensor_rt", "tvm"], metric_drop_ths=0.1, ) #3 Save the optimized model save_model(optimized_model, "model_save_path") ``` Once the optimization is completed, start using the accelerated model (on steroids 🚀). 
```python #4 Load and run your PyTorch accelerated model in production from speedster import load_model optimized_model = load_model("model_save_path", pipe=pipe) test_prompt = "futuristic llama with a cyberpunk city on the background" output = optimized_model(test_prompt).images[0] ``` For more details, please visit [Getting Started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).
🌊 TensorFlow/Keras In this section, we will learn about the 4 main steps needed to optimize TensorFlow/Keras models: 1) Input your model and data 2) Run the optimization 3) Save your optimized model 4) Load and run your optimized model in production ```python import tensorflow as tf from tensorflow.keras.applications.resnet50 import ResNet50 from speedster import optimize_model, save_model #1 Provide input model and data (we support Keras dataset and custom input, see the docs to learn more) model = ResNet50() input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)] #2 Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) #3 Save the optimized model save_model(optimized_model, "model_save_path") ``` Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice. ```python #4 Load and run your TensorFlow accelerated model in production from speedster import load_model optimized_model = load_model("model_save_path") output = optimized_model(input_sample) ``` For more details, please visit [Getting Started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/).
⚡ ONNX In this section, we will learn about the 4 main steps needed to optimize ONNX models: 1) Input your model and data 2) Run the optimization 3) Save your optimized model 4) Load and run your optimized model in production ```python import numpy as np from speedster import optimize_model, save_model #1 Provide input model and data # Model was downloaded from here: # https://github.com/onnx/models/tree/main/vision/classification/resnet model = "resnet50-v1-12.onnx" input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)] #2 Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) #3 Save the optimized model save_model(optimized_model, "model_save_path") ``` Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice. ```python #4 Load and run your ONNX accelerated model in production from speedster import load_model optimized_model = load_model("model_save_path") output = optimized_model(input_sample) ``` For more details, please visit [Getting Started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/).
# **Documentation** - [Installation](https://docs.nebuly.com/Speedster/installation/) - [Getting started with PyTorch optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/) - [Getting started with Hugging Face optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/) - [Getting started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/) - [Getting started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/) - [Getting started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/) - [Key concepts](https://docs.nebuly.com/Speedster/key_concepts/) - [Notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) - [Advanced options](https://docs.nebuly.com/Speedster/advanced_options/) - [Benchmarks](https://docs.nebuly.com/Speedster/benchmarks/) # **Key concepts** Speedster's design reflects our mission to automatically master each and every existing AI acceleration technique to deliver the most cost-efficient AI ever. As a result, `Speedster` leverages available enterprise-grade open-source optimization tools. If these tools and communities already exist, and are distributed under a permissive license (Apache, MIT, etc), we integrate them and happily contribute to their communities. However, many tools do not exist yet, in which case we implement them and open-source the code so that our community can benefit from it. `Speedster` is shaped around **4 building blocks** and leverages a modular design to foster scalability and integration of new acceleration components across the software to hardware stack. - [x] **Converter:** converts the input model from its original framework to the framework backends supported by `Speedster`, namely PyTorch, ONNX and TensorFlow. 
This allows the Compressor and Compiler modules to apply any optimization technique to the model. - [x] **Compressor:** applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training. - [x] **Compiler:** converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files. - [x] **Inference Learner:** takes the best performing compiled model and converts it back into the same interface as the original input model. ![speedster_blocks](https://user-images.githubusercontent.com/42771598/213177175-a76908a2-5eef-4e82-9d54-0fc812131463.png) The **compressor** stage leverages the following open-source projects: - [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance. - [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models. The **compiler stage** leverages the following open-source projects: - [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators. - [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads. - [DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs. - [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference. 
- [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator - [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators. - [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): open-source libraries to accelerate TensorFlow models. # **Community** We’re developing `Speedster` for and together with our community, so please get in touch on GitHub or Discord. • **[GitHub issues](https://github.com/nebuly-ai/nebullvm/issues)**: suggest new acceleration components, request new features, and report bugs and improvements. • **[Discord](https://discord.gg/RbeQMu886J)**: learn about AI acceleration, share exciting projects and hang out with our global community. The best way to get started is to pick a good-first issue. Please read our [contribution guidelines](https://docs.nebuly.com/contributions/) for a deep dive into how to best contribute to our project! Don't forget to leave a star ⭐ to support the project and happy acceleration 🚀 ================================================ FILE: optimization/speedster/docs/en/docs/advanced_options.md ================================================ # Advanced options If you’re new to the library, you may want to start with the **Getting started** section. The user guide here shows more advanced workflows and how to use the library in different ways. We are going to show some examples of more advanced usages of `Speedster`, that we hope will give you a deeper insight of how `Speedster` works. 
In particular, we will give an overview of: - [`optimize_model`](#optimize_model-api) API - [Acceleration suggestions](#acceleration-suggestions) - [Selecting which device](#selecting-which-device-to-use--cpu-gpu-and-other-accelerators) to use: CPU, GPU and other accelerators - [Optimization Time: constrained vs unconstrained](#optimization-time--constrained-vs-unconstrained) - [Selecting specific compilers/compressors](#select-specific-compilerscompressors) - [Using dynamic shape](#using-dynamic-shape) - [Enable TensorrtExecutionProvider for ONNXRuntime on GPU](#enable-tensorrtexecutionprovider-for-onnxruntime-on-gpu) - [Custom models](#custom-models) - [Store the performances of all the optimization techniques](#store-the-performances-of-all-the-optimization-techniques) - [Set number of threads](#set-number-of-threads) ## `optimize_model` API The `optimize_model` function optimizes a model from one of the supported frameworks (PyTorch, HuggingFace, TensorFlow, ONNX), and returns an optimized model that can be used with the same interface as the original model. ```python def optimize_model( model: Any, input_data: Union[Iterable, Sequence], metric_drop_ths: Optional[float] = None, metric: Union[str, (...) -> Any, None] = None, optimization_time: str = "constrained", dynamic_info: Optional[dict] = None, config_file: Optional[str] = None, ignore_compilers: Optional[List[str]] = None, ignore_compressors: Optional[List[str]] = None, store_latencies: bool = False, device: str = None, **kwargs: Any ) -> Any ``` **Arguments** `model`: Any The input model can belong to one of the following frameworks: PyTorch, TensorFlow, ONNX, HuggingFace. In the ONNX case, `model` is a string with the path to the saved onnx model. In the other cases, it is a torch.nn.Module or a tf.Module. `input_data`: Iterable or Sequence Input data needed to test the optimization performances (latency, throughput, accuracy loss, etc). It can consist of one or more data samples. 
Note that if `optimization_time` is set to "unconstrained", it would be preferable to provide at least 100 data samples to also activate `Speedster` techniques that require more data (pruning, etc.). See the Getting started section to learn more about the `input_data` depending on your input framework: - [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md#1-input-model-and-data) - [Getting started with 🤗 HuggingFace optimization](getting_started/hf_getting_started.md#1-input-model-and-data) - [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md#2-input-model-and-data) - [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md#1-input-model-and-data) - [Getting started with ONNX optimization](getting_started/onnx_getting_started.md#1-input-model-and-data) `metric_drop_ths`: float, optional Maximum drop in your preferred metric (see "metric" section below). All the optimized models having a larger error with respect to the `metric_drop_ths` will be discarded. Default: 0. `metric`: Callable, optional Metric to be used for estimating the error that may arise from using optimization techniques and for evaluating if the error exceeds the `metric_drop_ths`. `metric` accepts as input a string, a user-defined metric, or None. Metric accepts a string containing the name of the metric; it currently supports: - "numeric_precision" - "accuracy". - user-defined metric: function that takes as input the output of the original model and the one of the optimized model, and, if available, the original label. The function calculates and returns the reduction in the metric due to the optimization. Default: "numeric_precision". `optimization_time`: OptimizationTime, optional The optimization time mode. It can be "constrained" or "unconstrained". In "constrained" mode, Speedster takes advantage only of compilers and precision reduction techniques, such as quantization. 
"unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation. Note that most techniques activated in "unconstrained" mode require fine-tuning, and therefore it is recommended to provide at least 100 samples as input_data. Default: "constrained". `dynamic_info`: Dict, optional Dictionary containing dynamic axis information. It should contain as keys both "inputs" and "outputs" and as values two lists of dictionaries, where each dictionary represents dynamic axis information for an input/output tensor. The inner dictionary should have an integer as a key, i.e. the dynamic axis (also considering the batch size) and a string as a value giving it a tag, e.g., "batch_size". Default: None. `config_file`: str, optional Configuration file containing the parameters needed to define the CompressionStep in the pipeline. Default: None. `ignore_compilers`: List[str], optional List of DL compilers ignored during optimization execution. The compiler name should be one among tvm, tensor_rt, openvino, onnxruntime, deepsparse, tflite, bladedisc, torchscript, intel_neural_compressor. Default: None. `ignore_compressors`: List[str], optional List of DL compressors ignored during the compression stage. The compressor name should be one among sparseml and intel_pruning. Default: None. `store_latencies`: bool, optional Parameter that enables storing the latency of each compiler used by Speedster in a JSON file. The JSON is created in the working directory. Default: False. `device`: str, optional Device used for inference, it can be cpu or gpu/cuda (both gpu and cuda options are supported). A specific gpu can be selected using notation gpu:1 or cuda:1. gpu will be used if available, otherwise cpu. Default: None. **Returns: Inference Learner** Optimized version with the same interface of the input model. 
For example, optimizing a PyTorch model will return an InferenceLearner object that can be called exactly like a PyTorch model (either with model.forward(input) or model(input)). The optimized model will therefore take as input a torch.Tensors and return a torch.Tensors. ## Acceleration suggestions If the speedup you obtained with the first optimization with `Speedster` is not enough, we suggest the following actions: - Include more backends for optimization, i.e. set `--backend all` - Increase the `metric_drop_ths` by 5%, if possible: see [Optimize_model API](#optimize_model-api) - Verify that your device is supported by your version of speedster: see [Supported hardware](hardware.md) - Try to accelerate your model on a different hardware or consider using the CloudSurfer module to automatically understand which is the best hardware for your model: see [CloudSurfer](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/cloud_surfer) module. ## Selecting which device to use: CPU, GPU and other accelerators. Speedster currently supports the following devices: `CPUs`, `GPUs`, `TPUs` and `AWS Inferentia chips`. The parameter `device` allows to select which device we want to use for inference. By default, `Speedster` will use the accelerator if available on the machine, otherwise it will use cpu. 
If we are running on a machine with an available accelerator and we want to optimize the model for cpu inference, we can use: ```python from speedster import optimize_model optimized_model = optimize_model( model, input_data=input_data, device="cpu" ) ``` If we are working on a multi-gpu machine and we want to use a specific gpu, we can use: ```python from speedster import optimize_model optimized_model = optimize_model( model, input_data=input_data, device="cuda:1" # also device="gpu:1" is supported ) ``` The same applies also for TPUs and AWS Inferentia chips: ```python from speedster import optimize_model optimized_model = optimize_model( model, input_data=input_data, device="tpu:1" # use tpu #1 ) optimized_model = optimize_model( model, input_data=input_data, device="neuron:1" # use Inferentia chip #1 ) ``` ## Optimization Time: constrained vs unconstrained One of the first options that can be customized in `Speedster` is the `optimization_time` parameter. In order to optimize the model, `Speedster` will try a list of compilers which allow to keep the same accuracy of the original model. In addition to compilers, it can also use other techniques such as pruning, quantization, and other compression techniques which can lead to a little drop in accuracy and may require some time to complete. We defined two scenarios: - **constrained**: only compilers and precision reduction techniques are used, so the compression step (the most time consuming one) is skipped. Moreover, in some cases the same compiler could be available for more than one pipeline, for example tensor RT is available both with PyTorch and ONNX backends. In the constrained scenario, each compiler will be used only once, so if for example we optimize a PyTorch model and tensor RT in the PyTorch pipeline manages to optimize the model, it won't be used again in the ONNX pipeline. 
- **unconstrained**: in this scenario, `Speedster` will use all the compilers available, even if they appear in more than one backend. It also allows the usage of more time consuming techniques such as pruning and distillation. Note that for using many of the sophisticated techniques in the 'unconstrained' optimization, a small fine-tuning of the model will be needed. Thus, we highly recommend to provide as input_data at least 100 samples when selecting 'unconstrained' optimization. ## Select specific compilers/compressors The `optimize_model` functions accepts also the parameters `ignore_compilers` and `ignore_compressors`, which allow to skip specific compilers or compressors. The full list of available options is the following: - _ignore_compilers_: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `onnx_tensor_rt`, `torchscript`, `onnxruntime`, `tflite`, `tvm`, `onnx_tvm`, `torch_tvm`, `bladedisc`, `openvino`, `intel_neural_compressor`, `torch_xla`, `torch_neuron`. - _ignore_compressors_: `sparseml`, `intel_pruning`. Some compilers, such as tensor RT, are available for both PyTorch and ONNX backends. For this reason in the list of compilers we have `tensor_rt` which skips both the PyTorch and ONNX pipelines, and `torch_tensor_rt` and `onnx_tensor_rt` which skip only the PyTorch and ONNX pipelines respectively. If we want to skip the `tvm` and `bladedisc` optimizers, we could write: ```python from speedster import optimize_model optimized_model = optimize_model( model, input_data=input_data, ignore_compilers=["tvm", "bladedisc"] ) ``` ## Using dynamic shape By default, a model optimized with `Speedster` will have a static shape. This means that it can be used in inference only with the same shape of the inputs provided to the `optimize_model` function during the optimization. 
The dynamic shape however is fully supported, and can be enabled with the `dynamic_info` parameter (see the [optimize_model API](#optimize_model-api) arguments to see how this parameter is defined). For each dynamic axis in the inputs, we need to provide the following information: - the axis number (starting from 0, considering the batch size as the first axis) - a tag that will be used to identify the axis - the minimum, optimal and maximum sizes of the axis (some compilers will work also for shapes that are not in the range [min, max], but the performance may be worse) Let's see an example of a model that takes a single input, where the batch size must be dynamic, as well as the size on the third and fourth dimensions. ```python import torch import torchvision.models as models from speedster import optimize_model # Load a resnet as example model = models.resnet50() # Provide an input data for the model input_data = [((torch.randn(1, 3, 256, 256),), torch.tensor([0])) for _ in range(100)] # Set dynamic info dynamic_info = { "inputs": [ { 0: { "name": "batch", "min_val": 1, "opt_val": 1, "max_val": 8, }, 2: { "name": "dim_image", "min_val": 128, "opt_val": 256, "max_val": 512, }, 3: { "name": "dim_image", "min_val": 128, "opt_val": 256, "max_val": 512, }, } ], "outputs": [ {0: "batch", 1: "out_dim"} ] } # Run Speedster optimization in one line of code optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", dynamic_info=dynamic_info ) ``` ## Enable TensorrtExecutionProvider for ONNXRuntime on GPU By default, `Speedster` will use the `CUDAExecutionProvider` for ONNXRuntime on GPU. If you want to use the `TensorrtExecutionProvider` instead, you must add the TensorRT installation path to the env variable LD_LIBRARY_PATH. 
If you installed TensorRT through the nebullvm auto_installer, you can do it by running the following command in the terminal (replace the placeholder with the path to your Python installation folder): ```bash export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"/<PATH_TO_PYTHON_FOLDER>/site-packages/tensorrt" ``` ## Custom models `Speedster` is designed to optimize models that take as inputs and return in output only tensors or np.ndarrays (and dictionaries/strings for huggingface). Some models may require instead a custom input, for example a dictionary where the keys are the names of the inputs and the values are the input tensors, or may return a dictionary as output. We can optimize such models with `Speedster` by defining a model wrapper. Let's take the example of the detectron2 model which takes as input a tuple of tensors but returns a dictionary as output: ```python class BaseModelWrapper(torch.nn.Module): def __init__(self, core_model, output_dict): super().__init__() self.core_model = core_model self.output_names = [key for key in output_dict.keys()] def forward(self, *args, **kwargs): res = self.core_model(*args, **kwargs) return tuple(res[key] for key in self.output_names) class OptimizedWrapper(torch.nn.Module): def __init__(self, optimized_model, output_keys): super().__init__() self.optimized_model = optimized_model self.output_keys = output_keys def forward(self, *args): res = self.optimized_model(*args) return {key: value for key, value in zip(self.output_keys, res)} input_data = [((torch.randn(1, 3, 256, 256)), torch.tensor([0]))] # Compute the original output of the model (in dict format) res = model_backbone(torch.randn(1, 3, 256, 256)) # Pass the model and the output sample to the wrapper backbone_wrapper = BaseModelWrapper(model_backbone, res) # Optimize the model wrapper optimized_model = optimize_model(backbone_wrapper, input_data=input_data) # Wrap the optimized model with a new wrapper to restore the original model output format optimized_backbone = OptimizedWrapper(optimized_model, backbone_wrapper.output_names) ``` You can find other examples 
in the [notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) section available on GitHub. ## Store the performances of all the optimization techniques `Speedster` internally tries all the techniques available on the target hardware and automatically chooses the fastest one. If you need more details on the inference times of each compiler, you can set the `store_latencies` parameter to `True`. A json file will be created in the working directory, listing all the results of the applied techniques and of the original model itself. ```python # Run Speedster optimization in one line of code optimized_model = optimize_model( model, input_data=input_data, store_latencies=True ) ``` ## Set number of threads When running multiple replicas of the model in parallel, it would be useful for CPU-optimized algorithms to limit the number of threads to use for each model. In `Speedster`, it is possible to set the maximum number of threads a single model can use with the environment variable `NEBULLVM_THREADS_PER_MODEL`. For instance, you can run: ```bash export NEBULLVM_THREADS_PER_MODEL=2 ``` to use just two CPU threads per model at inference time and during optimization. ================================================ FILE: optimization/speedster/docs/en/docs/benchmarks.md ================================================ # Benchmarks !!! info In this section you are going to learn how `Speedster` accelerates the inference of various models on different hardware architectures. 
Here we provide a preview of the following accelerated models: - [Bert](#bert) - [YoloV5](#yolov5) - [EfficientNet](#efficientnet) - [GPT2](#gpt2) - [ResNet](#resnet) - [Roberta](#roberta) The above models are tested on very popular hardware architectures and instances: - AWS - c5n.2xlarge - AWS - c5.12xlarge - AWS - c6i.12xlarge - AWS - m6i.24xlarge - NVIDIA T4 - NVIDIA V100 - NVIDIA 3090 ## Bert ![bert](images/bert.png) ## YoloV5 ![yolov5](images/yolov5.png) ## EfficientNet ![efficientnet](images/efficientnet.png) ## GPT2 ![gpt2](images/gpt2.png) ## ResNet ![resnet](images/resnet.png) ## Roberta ![roberta](images/roberta.png) ================================================ FILE: optimization/speedster/docs/en/docs/getting_started/diffusers_getting_started.md ================================================ # Getting started with Stable Diffusion optimization In this section, we will learn about the 5 main steps needed to optimize Stable Diffusion models from the `Diffusers` library: 1. [Environment Setup](#1-environment-setup-gpu-only) 2. [Input your model and data](#2-input-model-and-data) 3. [Run the optimization](#3-run-the-optimization) 4. [Save your optimized model](#4-save-your-optimized-model) 5. [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production) ## 1) Environment Setup (GPU only) In order to optimize a Stable Diffusion model, you have to ensure that your environment is correctly set up according to these requirements: `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. From TensorRT 8.6, all the tensorrt pre-built wheels released by nvidia support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. In that case, you will have to upgrade your CUDA version and then upgrade tensorrt to 8.6.0 or above. 
There should be a way to run TensorRT 8.6 also with CUDA 11, but it requires installing TensorRT in a different way, you can check this issue: https://github.com/NVIDIA/TensorRT/issues/2773. Otherwise, we highly suggest to just upgrade to CUDA 12. For now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully. You can check your CUDA version with the following command: ```bash nvidia-smi ``` If you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads You can check your TensorRT version with the following command: ```bash python -c "import tensorrt; print(tensorrt.__version__)" ``` If you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running: ``` pip install -U tensorrt ``` You can finally check your PyTorch version with the command ```bash python -c "import torch; print(torch.__version__)" ``` If you have torch>=2.0.0, you can downgrade it by running: ``` pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 ``` ## 2) Input model and data !!! info In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). For Stable Diffusion models Speedster expects the input data to be a list of sentences: ```List[str]``` ```python import torch from speedster import optimize_model from diffusers import StableDiffusionPipeline # Load Stable Diffusion 1.4 as example model_id = "CompVis/stable-diffusion-v1-4" device = "cuda" if torch.cuda.is_available() else "cpu" if device == "cuda": # On GPU we load by default the model in half precision, because it's faster and lighter. 
pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16) else: pipe = StableDiffusionPipeline.from_pretrained(model_id) # Create some example input data input_data = [ "a photo of an astronaut riding a horse on mars", "a monkey eating a banana in a forest", "white car on a road surrounded by palm trees", "a fridge full of bottles of beer", "madara uchiha throwing asteroids against people" ] ``` Now your input model and data are ready, you can move on to [Run the optimization](#3-run-the-optimization) section 🚀. ## 3) Run the optimization Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. The function takes the following arguments as inputs: - `model`: model to be optimized in your preferred framework (A Diffusers pipe in this case) - `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc) - `optimization_time`: if "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation - `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration and returns the accelerated version of your model 🚀. ``` python from speedster import optimize_model # Run Speedster optimization optimized_model = optimize_model( pipe, input_data=input_data, optimization_time="unconstrained", metric_drop_ths=0.05 ) ``` Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware. 
At the end of the optimization, you are going to see the results in a summary table like the following: ![pt](../images/stable_diffusion.png) If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#4-save-your-optimized-model) section to save your model and use it in production. If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly. ## 4) Save your optimized model After accelerating the model, it can be saved using the `save_model` function: ```python from speedster import save_model save_model(optimized_model, "model_save_path") ``` Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production) section. ## 5) Load and run your optimized model in production Once the optimized model has been saved, it can be loaded with the `load_model` function: ```python from speedster import load_model optimized_model = load_model("model_save_path", pipe=pipe) ``` In this case we must also provide the original pipe as an argument to the `load_model` function; Speedster will automatically load the optimized model and replace the original UNet inside the pipe. The optimized model can be used for accelerated inference in the same way as the original model: ```python # Use the accelerated version of your Stable Diffusion model in production output = optimized_model(test_prompt).images[0] ``` !!! info The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever. 
If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section. ================================================ FILE: optimization/speedster/docs/en/docs/getting_started/hf_getting_started.md ================================================ # Getting started with HuggingFace optimization In this section, we will learn about the 4 main steps needed to optimize your 🤗 HuggingFace models: 1. [Input your model and data](#1-input-model-and-data) 2. [Run the optimization](#2-run-the-optimization) 3. [Save your optimized model](#3-save-your-optimized-model) 4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) ## 1) Input model and data !!! info In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). For HuggingFace models we support different types of input data depending on the architecture of your input model. - [x] For Decoder-only or Encoder-only architectures (Bert, GPT, etc), we support: - Dictionary - String - [x] For Encoder-Decoder architectures (T5 etc), we support: - Dictionary === "Decoder-only or Encoder-only (Bert, GPT, etc)" **Input as Dictionary** ```python from transformers import AlbertModel, AlbertTokenizer # Load Albert as example model = AlbertModel.from_pretrained("albert-base-v1") tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1") # Case 1: dictionary input format text = "This is an example text for the huggingface model." input_dict = tokenizer(text, return_tensors="pt") input_data = [input_dict for _ in range(100)] ``` Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀. 
**Input as String** In the string case, the HuggingFace tokenizer must be given as input to the `optimize_model` in addition to the `input_data`, and the arguments for the tokenizer can be passed using the param `tokenizer_args`. ```python from transformers import AlbertModel, AlbertTokenizer # Load Albert as example model = AlbertModel.from_pretrained("albert-base-v1") tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1") # Case 2: strings input format input_data = [ "This is a test.", "Hi my name is John.", "The cat is on the table.", ] tokenizer_args = dict( return_tensors="pt", padding="longest", truncation=True, ) ``` Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀. === "Encoder-Decoder architectures (T5 etc)" For encoder-decoder architectures we support only `input_data` as Dictionary: ```python from transformers import T5Tokenizer, T5ForConditionalGeneration # Load T5 as example model = T5ForConditionalGeneration.from_pretrained("t5-small") tokenizer = T5Tokenizer.from_pretrained("t5-small") # Case 1: dictionary input format question = "What's the meaning of life?" answer = "The answer is:" input_dict = tokenizer(question, return_tensors="pt") input_dict["decoder_input_ids"] = tokenizer(answer, return_tensors="pt").input_ids input_data = [input_dict for _ in range(100)] ``` Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀. ## 2) Run the optimization Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 
The function takes the following arguments as inputs: - `model`: model to be optimized in your preferred framework (HuggingFace in this case) - `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc) - `optimization_time`: if "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation - `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration and returns the accelerated version of your model 🚀. Depending on the format of your `input_data`, the `optimize_model` is as follows: === "Input as Dictionary" ```python from speedster import optimize_model # Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) ``` === "Input as String" ```python from speedster import optimize_model # Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05, tokenizer=tokenizer, tokenizer_args={"return_tensors": "pt"} ) ``` Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware. At the end of the optimization, you are going to see the results in a summary table like the following: ![pt](../images/pt_table.png) If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production. 
If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly. ## 3) Save your optimized model After accelerating the model, it can be saved using the `save_model` function: ```python from speedster import save_model save_model(optimized_model, "model_save_path") ``` Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section. ## 4) Load and run your optimized model in production Once the optimized model has been saved, it can be loaded with the `load_model` function: ```python from speedster import load_model optimized_model = load_model("model_save_path") ``` The optimized model can be used for accelerated inference in the same way as the original model: ```python # Use the accelerated version of your HuggingFace model in production output = optimized_model(**input_sample) ``` !!! info The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever. If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section. ================================================ FILE: optimization/speedster/docs/en/docs/getting_started/onnx_getting_started.md ================================================ # Getting started with ONNX optimization In this section, we will learn about the 4 main steps needed to optimize your ONNX models: 1. [Input your model and data](#1-input-model-and-data) 2. 
[Run the optimization](#2-run-the-optimization) 3. [Save your optimized model](#3-save-your-optimized-model) 4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) ## 1) Input model and data !!! info In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). ```python import numpy as np # Load a resnet as example # Model was downloaded from here: # https://github.com/onnx/models/tree/main/vision/classification/resnet model = "resnet50-v1-12.onnx" # Provide input data for the model input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)] ``` Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀. ## 2) Run the optimization Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. The function takes the following arguments as inputs: - `model`: model to be optimized in your preferred framework (ONNX in this case) - `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc) - `optimization_time`: if "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation - `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration and returns the accelerated version of your model 🚀. 
``` python from speedster import optimize_model # Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) ``` Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware. At the end of the optimization, you are going to see the results in a summary table like the following: ![pt](../images/pt_table.png) If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production. If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly. ## 3) Save your optimized model After accelerating the model, it can be saved using the `save_model` function: ```python from speedster import save_model save_model(optimized_model, "model_save_path") ``` Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section. ## 4) Load and run your optimized model in production Once the optimized model has been saved, it can be loaded with the `load_model` function: ```python from speedster import load_model optimized_model = load_model("model_save_path") ``` The optimized model can be used for accelerated inference in the same way as the original model: ```python # Use the accelerated version of your ONNX model in production output = optimized_model(input_sample) ``` !!! 
info The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever. If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section. ================================================ FILE: optimization/speedster/docs/en/docs/getting_started/pytorch_getting_started.md ================================================ # Getting started with PyTorch optimization In this section, we will learn about the 4 main steps needed to optimize PyTorch models: 1. [Input your model and data](#1-input-model-and-data) 2. [Run the optimization](#2-run-the-optimization) 3. [Save your optimized model](#3-save-your-optimized-model) 4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) ## 1) Input model and data !!! info In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). For PyTorch models we support two types of input data: * Custom data format * PyTorch DataLoader === "Custom Data Format" Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]``` - Each element of the list is a tuple, which represents a batch of the dataset. - In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. The label is optional, so it can be omitted. 
``` python import torch import torchvision.models as models # Load a resnet as example model = models.resnet50() # Provide input data for the model input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)] ``` See below further examples with custom format: ``` python # Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 with labels input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)] # Dataset for a model that takes 2 inputs, containing 100 batches of data with bs=5 with labels input_data = [((torch.randn(5, 3, 256, 256), torch.randn(5, 3, 256, 256), ), torch.tensor([0, 1, 0, 1, 1])) for _ in range(100)] # Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 without labels input_data = [((torch.randn(1, 3, 256, 256), ), ) for _ in range(100)] ``` Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀. === "PyTorch DataLoader" We support the following DataLoader types: * Tensor only * Tensor and labels For models with multiple inputs, we support the following types: - input_1, input_2, ..., input_n, label - (input_1, input_2, ..., input_n), label ```python import torch import torchvision.models as models # Load a resnet as example model = models.resnet50() # Use your PyTorch DataLoader in any of the standard format input_data = ``` Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section. ## 2) Run the optimization Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 
The function takes the following arguments as inputs: - `model`: model to be optimized in your preferred framework (PyTorch in this case) - `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc) - `optimization_time`: if "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation - `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration and returns the accelerated version of your model 🚀. ``` python from speedster import optimize_model # Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) ``` Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware. At the end of the optimization, you are going to see the results in a summary table like the following: ![pt](../images/pt_table.png) If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production. If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly. 
## 3) Save your optimized model After accelerating the model, it can be saved using the `save_model` function: ```python from speedster import save_model save_model(optimized_model, "model_save_path") ``` Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section. ## 4) Load and run your optimized model in production Once the optimized model has been saved, it can be loaded with the `load_model` function: ```python from speedster import load_model optimized_model = load_model("model_save_path") ``` The optimized model can be used for accelerated inference in the same way as the original model: ```python # Use the accelerated version of your PyTorch model in production output = optimized_model(input_sample) ``` !!! info The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever. If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section. ================================================ FILE: optimization/speedster/docs/en/docs/getting_started/tf_getting_started.md ================================================ # Getting started with TensorFlow optimization In this section, we will learn about the 4 main steps needed to optimize TensorFlow models: 1. [Input your model and data](#1-input-model-and-data) 2. [Run the optimization](#2-run-the-optimization) 3. [Save your optimized model](#3-save-your-optimized-model) 4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) ## 1) Input model and data !!! 
info In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). For TensorFlow models we support two types of input data: * Custom data format * TensorFlow DataLoader === "Custom Data Format" Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]``` - Each element of the list is a tuple, which represents a batch of the dataset. - In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. The label is optional, so it can be omitted. ``` python import tensorflow as tf from tensorflow.keras.applications.resnet50 import ResNet50 # Load a resnet as example model = ResNet50() # Provide input data for the model input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)] ``` Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀. === "TensorFlow DataLoader" We support the following DataLoader types: * Tensor only * Tensor and labels For models with multiple inputs, we support the following types: - input_1, input_2, ..., input_n, label - (input_1, input_2, ..., input_n), label ```python import torch import torchvision.models as models # Load a resnet as example model = models.resnet50() # Use your TensorFlow DataLoader in any of the standard format input_data = ``` Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section. ## 2) Run the optimization Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 
The function takes the following arguments as inputs: - `model`: model to be optimized in your preferred framework (TensorFlow in this case) - `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc) - `optimization_time`: if "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation - `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration and returns the accelerated version of your model 🚀. ``` python from speedster import optimize_model # Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="constrained", metric_drop_ths=0.05 ) ``` Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware. At the end of the optimization, you are going to see the results in a summary table like the following: ![pt](../images/hf_table.png) If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production. If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly. 
## 3) Save your optimized model After accelerating the model, it can be saved using the `save_model` function: ```python from speedster import save_model save_model(optimized_model, "model_save_path") ``` Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section. ## 4) Load and run your optimized model in production Once the optimized model has been saved, it can be loaded with the `load_model` function: ```python from speedster import load_model optimized_model = load_model("model_save_path") ``` The optimized model can be used for accelerated inference in the same way as the original model: ```python # Use the accelerated version of your TensorFlow model in production output = optimized_model(input_sample) ``` !!! info The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever. If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section. ================================================ FILE: optimization/speedster/docs/en/docs/hardware.md ================================================ # Supported hardware `Speedster` has been mostly tested on Nvidia GPUs and Intel/AMD CPUs. The library may also work with other hardware on which it has not been tested. Please let us know if you find out that `Speedster` works well on other hardware or if you find issues.
Fully supported hardware: - Intel CPU - Nvidia GPU Hardware we are currently integrating: - Apple M1 - AMD CPU - Intel GPU (open issue 👩‍💻) ================================================ FILE: optimization/speedster/docs/en/docs/installation.md ================================================ # Installation In this installation guide we will learn: - [Quick installation](#quick-installation) of `Speedster` with pip **(Recommended)** - [Selective installation](#optional-selective-installation-of-speedster-requirements) of the requirements **(Optional)** - [Installation](#optional-download-docker-images-with-frameworks-and-optimizers) with Docker **(Optional)** - [Set up Speedster on custom DL devices](#set-up-speedster-on-custom-dl-devices) to run models on Google TPUs and AWS Inferentia Chips ## Quick installation You can easily install `Speedster` using pip. pip install speedster Then make sure to install all the available deep learning compilers: python -m nebullvm.installers.auto_installer --compilers all !!! info If you want to optimize PyTorch or HuggingFace models, PyTorch must be pre-installed in the environment before using the auto-installer, please install it from [this](https://pytorch.org/get-started/locally/) link. Moreover, for Mac computers with M1/M2 processors, please use a conda environment, or you may run into problems when installing some of the deep learning compilers. 
Great, now you are ready to accelerate your model 🚀 Please visit the following pages to get started based on the DL framework of your input model: - [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md) - [Getting started with 🤗 Hugging Face optimization](getting_started/hf_getting_started.md) - [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md) - [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md) - [Getting started with ONNX optimization](getting_started/onnx_getting_started.md) ## (Optional) Selective installation of Speedster requirements By default, the `auto_installer` installs all the DL frameworks and compilers supported by `Speedster`. However, some of these may not be relevant to your use case. In this section, we explain how you can customize the installation of these libraries, avoiding those that are not needed. To customize the libraries installation you have two options: - [Use the auto-installer (recommended)](#use-the-auto-installer-recommended) - [Install the libraries manually](#manual-installation) ### Use the auto-installer (recommended) To understand how to selectively install your preferred libraries, let's examine the auto-installer API: ```bash python -m nebullvm.installers.auto_installer --frameworks --extra-backends --compilers ``` !!! Description === "--frameworks" `frameworks` is used to specify the deep learning framework of your input model. The supported frameworks are `torch`, `tensorflow`, `onnx`, `huggingface` and `diffusers`. - if you want to optimize a model with a single DL framework, the code is as follows (example below for HuggingFace): ```python python -m nebullvm.installers.auto_installer --frameworks huggingface ``` Please remember that for PyTorch optimization, you should pre-install PyTorch from the official [repo](https://pytorch.org/get-started/locally/). 
- If you want to optimize models in multiple input frameworks, you can include them separated with a space: ```python python -m nebullvm.installers.auto_installer --frameworks tensorflow torch ``` - If you want to include all the frameworks, you can use `all` as the argument: ```python python -m nebullvm.installers.auto_installer --frameworks all ``` Default: `all`. === "--extra-backends" After entering your input model, `Speedster` converts the input model from its original framework into an intermediate framework to be used during the optimization; we call these intermediate frameworks "backends." To learn more, see the section [Model Converter](https://docs.nebuly.com/Speedster/key_concepts/) in the docs. This conversion allows `Speedster` to apply all optimization techniques without being constrained by the input framework of your model. The supported backends are `torch`, `tensorflow` and `onnx`. You can specify multiple backends by separating them with a space. - For example, if you want to install TensorFlow and ONNX as backends of a HuggingFace model, the code is as follows: ```python python -m nebullvm.installers.auto_installer --frameworks huggingface --extra-backends tensorflow onnx ``` - If you want to install all the backends supported by the selected frameworks, you can use `all` as the argument. - If you don't want to install extra backends, you can set `--extra-backends none`. The extra-backends that you choose must be compatible with at least one of the input frameworks you previously selected with the argument `--frameworks`; please see the table below to see the compatibility matrix. Default: `all`. === "--compilers" `compilers` is used to specify the deep learning compilers to be installed. The supported compilers are: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `openvino` and `intel_neural_compressor`.
The compilers must be compatible with at least one of the backends selected with the argument `--extra-backends`; please see the table below to see the compatibility matrix. - You can specify multiple compilers by separating them with a space. For example: ```python --compilers deepsparse tensor_rt ``` will install DeepSparse and TensorRT. - If you want to install all the compilers supported by the selected frameworks/backends, you can use `all` as the argument. Speedster also supports `torchscript`, `tf_lite`, and `onnxruntime` as built-in; these are preinstalled with their respective backends, so there is no need to include them in the list. Speedster also supports `tvm`, which is currently not supported by the automatic installer and must be installed manually; see the next section if you wish to include it. Default: `all`. Let's see an example of how to use these three arguments: ```bash python -m nebullvm.installers.auto_installer --frameworks torch --extra-backends all --compilers all ``` This command will set up your environment to optimize PyTorch models, and will install all PyTorch-supported backends and compilers. The following table shows the supported combinations of frameworks, backends and compilers that you can install with the auto-installer: | Framework | Extra Backends | Compilers | |--------------|---------------------------|-------------------------------------------------------------------------| | PyTorch | ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor | | TensorFlow | ONNX | TensorRT, OpenVINO | | ONNX | / | TensorRT, OpenVINO | | Hugging Face | PyTorch, TensorFlow, ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor | | Diffusers | PyTorch, ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor | !!! info Hugging Face models can be of two types, PyTorch-based or TensorFlow-based.
For PyTorch-based models, it is necessary to include `torch` as an extra-backend. For TensorFlow-based models, you must include `tensorflow` as an extra-backend. ### Manual installation If you want to manually install the requirements, this section collects links to the official installation guides for all frameworks and compilers supported by `Speedster`. #### Deep Learning frameworks/backends - PyTorch: https://pytorch.org/get-started/locally/ - TensorFlow: https://www.tensorflow.org/install - ONNX: https://github.com/onnx/onnx#installation - HuggingFace: https://huggingface.co/transformers/installation.html - Diffusers: https://github.com/huggingface/diffusers#installation #### Deep Learning compilers - DeepSparse: https://github.com/neuralmagic/deepsparse#installation - TensorRT: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html - Torch TensorRT: https://pytorch.org/TensorRT/getting_started/installation.html#installation - ONNXRuntime: https://onnxruntime.ai/docs/install/#python-installs - OpenVINO: https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#step-4-install-the-package - Intel Neural Compressor: https://github.com/intel/neural-compressor#installation - Apache TVM: https://tvm.apache.org/docs/install/index.html #### Other requirements - tf2onnx: https://github.com/onnx/tensorflow-onnx#installation (Install it if you want to convert TensorFlow models to ONNX) - polygraphy: https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy#installation (Install it if you want to use TensorRT) - onnx-simplifier: https://github.com/daquexian/onnx-simplifier#python-version (Install it if you want to use TensorRT) - onnx_graphsurgeon: https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon#installation (Install it if you want to use TensorRT with Stable Diffusion) - onnxmltools: https://github.com/onnx/onnxmltools#install (Install it if you want to convert models to ONNX) ## (Optional) Download 
Docker images with frameworks and optimizers Instead of installing the frameworks and compilers needed for optimization, which can be a time-consuming task, you can simply download a Docker container with all compilers preinstalled. To pull up the Docker image, run: docker pull nebulydocker/nebullvm:latest and then run and access the Docker with: docker run -ti --gpus=all nebulydocker/nebullvm:latest After optimizing the model, you may decide to deploy it to production. Note that you need to have the deep learning compiler used to optimize the model and other components inside the production Docker. For this reason, we have created several versions of the Docker nebullvm container in the [Docker Hub](https://hub.docker.com/repository/docker/nebulydocker/nebullvm), each containing only one compiler. Pull the image with the compiler that has optimized your model! ## Set up Speedster on custom DL devices From version `0.10.0`, Speedster supports optimization of PyTorch models on `Google TPUs` and `AWS Inferentia` chips. For these devices, the user must ensure that the required libraries are installed on the machine. The following sections describe how to install the required libraries for each device. ### Google TPUs In order to use a TPU, you must request a TPU-enabled VM from Google Cloud. You can consult the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en) for more information about how to create a TPU VM and how to get started with PyTorch on TPUs. To use Speedster on Google TPUs, we will use the [`torch_xla`](https://github.com/pytorch/xla) library, which is already preinstalled in all the Google Cloud TPU VMs, you will find it in the base Python3 environment. After creating the VM, you can follow these steps to set up Speedster: - Check that the `torch_xla` library is installed in the base Python3 environment. 
You can do this by running `python3 -c "import torch_xla; print(torch_xla.__version__)"` in the VM console; - Set TPU runtime configuration as explained in the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#set_tpu_runtime_configuration); - [Optional] Check that the TPU is working by running the [official example](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#perform_a_simple_calculation); - Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on TPUs. You are now ready to use Speedster on TPUs! Speedster will automatically detect the TPU device and will use the `torch_xla` library to optimize the model, comparing its performance with the original model running on the CPU. ### AWS Inferentia For AWS Inferentia, you must first create an AWS EC2 instance with the `inf1` instance type. You can find more information about `inf1` instances in the [official documentation](https://aws.amazon.com/it/ec2/instance-types/inf1/). !!! info AWS has recently released the `inf2` instance type, which is a more powerful version of `inf1`. For now `inf2` instances are only available in private preview; you can request them directly from AWS by filling out this [form](https://pages.awscloud.com/EC2-Inf2-Preview.html). To use Speedster on AWS Inferentia, we will use the [`torch-neuron`](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-setup.html) library, which must be manually installed on `inf1` instances (on `inf2` instances it's already preinstalled if you use the PyTorch DLAMI provided by AWS).
You can find here the full guides to set up the EC2 instances and install the required libraries: - `inf1`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/setup/pytorch-install.html#install-neuron-pytorch - `inf2`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/setup/pytorch-install.html#pytorch-neuronx-install After creating the EC2 instance and installing `torch_neuron`, you can follow these steps to set up Speedster: - Check that the `torch_neuron` library is installed, you can do this by running `python -c "import torch_neuron; print(torch_neuron.__version__)"` in the console (if using `inf1` instances, otherwise change `torch_neuron` with `torch_neuronx`); - Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on AWS Inferentia. You are now ready to use Speedster on AWS Inferentia! Speedster will automatically detect the AWS Inferentia device and will use the `torch_neuron` library to optimize the model, comparing its performances with the original model running on the CPU. ================================================ FILE: optimization/speedster/docs/en/docs/key_concepts.md ================================================ # Key concepts In this section we are going to learn the architectural design of the 4 building blocks of `Speedster`. - [x] **Converter**: converts the input model from its original framework to the framework backends supported by Speedster, namely PyTorch, TensorFlow, and ONNX. This allows the Compressor and Optimizer modules to apply any optimization technique to the model. - [x] **Compressor**: applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training. - [x] **Optimizer**: converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. 
The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files. - [x] **Inference Learner**: takes the best performing compiled model and converts it to the same interface as the original input model. ![speedster_blocks](https://user-images.githubusercontent.com/42771598/213177175-a76908a2-5eef-4e82-9d54-0fc812131463.png) The **compressor** stage leverages the following open-source projects: - [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance. - [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models. The **compiler stage** leverages the following open-source projects: - [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators. - [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads. - [DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs. - [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference. - [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator - [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators. 
- [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): open-source libraries to accelerate TensorFlow models. ## Model converter !!! Definition The Converter converts the input model from its original input framework to the framework backends supported by `Speedster`. This conversion enables the Compressor and the Compiler modules to apply all the optimization techniques without being constrained by the framework of your input model. ![image info](images/converter.png) `Speedster` supports deep learning models in the following input frameworks: - Hugging Face - Diffusers - ONNX - PyTorch - TensorFlow `Speedster` now includes 3 backends: - **ONNX backend**, which supports models in any input framework. - **PyTorch backend**, which supports input models in PyTorch, ONNX and Hugging Face. - **TensorFlow backend**, which supports input models in TensorFlow and ONNX. As you can see, to date, not all cross-conversions from input frameworks to each `Speedster` backend are supported. Let's see a couple of examples to better understand the potential of the Converter block: 1. PyTorch model as input: first of all Speedster will try the compilers available in the PyTorch backend pipeline, then it will convert it to ONNX and will also try the ones available in the ONNX backend optimization pipeline. Finally, the best one among them will be chosen and returned as the optimized model in your input framework (in this case PyTorch). 2. Hugging Face model as input: Let's assume that for your specific use case, the best optimization technique is a specific type of dynamic quantization only supported by PyTorch. If you feed a Hugging Face model into Speedster, the Converter will first transform your model into a PyTorch model. Speedster will then quantize it and finally return it as a Hugging Face model.
## Compressor The compressor applies various compression techniques to the model: - Block-wise un/structured sparsity (🎉 launched in 0.4.0 🎉) - Knowledge distillation (to be supported) - Layer replacement (to be supported) - Low-rank compression (to be supported) - Quantization-aware training (to be supported) - SparseML (🎉 launched in 0.4.0 🎉) ![image info](images/compressor.png) ## Compiler The Compiler block converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The different DL compilers perform both the low-level optimizations, which mostly consist of various quantization techniques, and graph optimizations. Finally, the model is compiled into binary. ![image info](images/compiler.png) Supported deep learning compilers: - Apache TVM - BladeDISC (🎉 launched in 0.4.0 🎉) - DeepSparse (🎉 launched in 0.4.0 🎉) - MLIR (open pull request 👩‍💻) - ONNX Runtime - OpenVINO - TensorRT - TF Lite / XLA - TorchScript Supported low-level optimizations: - Static quantization - Dynamic quantization - Half-precision - Low-bit quantization on TVM (to be supported) ## Inference learner The Learner, or Inference Learner, selects the most performing compiled model on your hardware and converts it to the same interface as the original input model. ![image info](images/learner.png) ================================================ FILE: optimization/speedster/docs/en/docs/notebooks.md ================================================ # Notebooks In this section you can find optimization notebooks for multiple DL input models: - HuggingFace - Diffusers - ONNX - Pytorch - Tensorflow Please check out notebooks and tutorials on GitHub at [this](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) link. 
================================================ FILE: optimization/speedster/docs/en/docs/overview.md ================================================ # Overview `Speedster` is an open-source module designed to accelerate AI inference in just a few lines of code. The library allows you to seamlessly modulate the inference performance of your AI models in terms of latency, throughput, model size, accuracy, and cost, and automatically applies the best set of optimization techniques across the software-to-hardware stack to meet your targets. `Speedster` makes it easy to combine optimization techniques across the whole software-to-hardware stack, delivering best-in-class speed-ups. If you like the idea, give us a star to support the project ⭐ ![speedster](https://user-images.githubusercontent.com/53374883/225600620-1cd84073-d9b3-43d1-84fa-c3e6c25eb915.png) The core `Speedster` workflow consists of 3 steps: - [x] **Select**: input your model in your preferred DL framework and express your preferences regarding: - Accuracy loss: do you want to trade off a little accuracy for much higher performance? - Optimization time: stellar accelerations can be time-consuming. Can you wait, or do you need an instant answer? - [x] **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) that is compatible with your needs and local hardware. - [x] **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just on steroids 🚀). Now you are ready to start accelerating your models; visit the [Installation](installation.md) section to start right away!
================================================ FILE: optimization/speedster/docs/en/docs/telemetry.md ================================================ # Telemetry `Speedster` is a young and rapidly evolving open-source project. There is plenty of room for improvement for Speedster to make your model achieve the very best performance on your hardware... and you may still find some bugs in the code 🪲 Contributions to this OSS project are warmly welcomed 🤗. We encourage you to check out the Contribution guidelines to understand how you can become an active contributor of the source code. ## Sharing feedback to improve Speedster Open source is a unique resource for sharing knowledge and building great projects collaboratively with the OSS community. To support the continued development, upon installation of Speedster you could share the information strictly necessary to improve the performance of this open-source project and facilitate bug detection and fixing. More specifically, you will foster project enhancement by sharing details of the optimization techniques used with Speedster and the performance achieved on your model and hardware. 
**Which data do we collect?** We make sure to collect as little data as possible to improve the open-source project: - basic information about the environment - basic information about the optimization Please find below an example of telemetry collection: ```python { "nebullvm_version": "0.6.0", "app_version": "0.0.1", "model_id": "e33a1bbf-fcfd-4f5a-81c9-a9154c7e9343_-7088971112344091114", "model_metadata": { "model_name": "ResNet", "model_size": "102.23 MB", "framework": "torch" }, "hardware_setup": { "cpu": "Apple M1 Pro", "operative_system": "Darwin", "ram": "17.18 GB" }, "optimizations": [ { "compiler": "torch", "technique": "original", "latency": 0.03 }, { "compiler": "NUMPY_onnxruntime", "technique": "none", "latency": 0.01 } ], "ip_address": "1.1.1.1" } ``` **How to opt out?** You can simply opt out of telemetry collection by setting the environment variable `SPEEDSTER_DISABLE_TELEMETRY` to `1`. **Should I opt out?** Being open-source, we have very limited visibility into the use of the tool unless someone actively contacts us or opens an issue on GitHub. We would appreciate it if you would keep telemetry enabled, as it helps us improve the source code. In fact, it brings increasing value to the project and helps us to better prioritize feature development. We understand that you may still prefer not to share telemetry data and we respect that desire. Please follow the steps above to disable data collection.
================================================ FILE: optimization/speedster/docs/en/mkdocs.yaml ================================================ site_name: Speedster docs_dir: ./docs nav: - Overview: overview.md - Installation: installation.md - Getting started: - PyTorch: getting_started/pytorch_getting_started.md - 🤗 HuggingFace: getting_started/hf_getting_started.md - 🧨 Stable Diffusion: getting_started/diffusers_getting_started.md - TensorFlow/Keras: getting_started/tf_getting_started.md - ONNX: getting_started/onnx_getting_started.md - Notebooks: notebooks.md - Key concepts: key_concepts.md - Supported hardware: hardware.md - Advanced options: advanced_options.md - Benchmarks: benchmarks.md - Telemetry: telemetry.md ================================================ FILE: optimization/speedster/notebooks/README.md ================================================ # **Jupyter notebooks** This folder contains notebooks showing how to use the `Speedster` app to optimize several models. The following frameworks are supported: - PyTorch - HuggingFace - Diffusers - Tensorflow - ONNX Examples of how to use `Speedster` are shown for each of these frameworks. In each folder we provide links to google colab where you can easily test the notebooks. If you want to test them on your own hardware, you can follow the guide below. ## 1. Setup To test notebooks, we have to create an environment where all the required dependencies are installed. First of all, clone the `nebullvm` repository: ``` git clone https://github.com/nebuly-ai/nebullvm.git ``` Next, navigate to the repo's root directory: ``` cd nebullvm ``` After cloning the repository there are two options: we can either install `Speedster` in a local environment or use a ready-to-use docker container. ### a. 
Using a local environment Install `Speedster` library: ``` pip install speedster ``` Install deep learning compilers: ``` python -m nebullvm.installers.auto_installer \ --frameworks all --compilers all ``` You can find additional options and details on the official [installation guide](https://docs.nebuly.com/modules/speedster/installation). After everything has been installed, you can start a jupyter session with the following command: ``` jupyter notebook --allow-root --port 8888 ``` And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888` Use the token listed in the output from running the jupyter command to log in, for example: `http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b` You can finally navigate to the `notebooks/speedster` folder and then to the folder of the framework that you want to try and start a notebook. ### b. Using a Docker container Another very easy way to test the following notebooks is by using one of the docker containers released on [dockerhub](https://hub.docker.com/r/nebulydocker/nebullvm). Pull the most up-to-date container image that has all compilers and their dependencies preinstalled: ``` docker pull nebulydocker/nebullvm:latest ``` Once pulled, the container can be launched with the following command: ``` docker run --rm --gpus all -ti -p 8888:8888 -v $PWD:/nebullvm nebulydocker/nebullvm:latest ``` The `-v` option in the command above allows to persist all the changes that will be done to the notebooks inside the container. Please note that, in order to enable gpu inside docker, you have to ensure that nvidia docker is installed. Please follow the "Setting up NVIDIA Container Toolkit" part from the official [installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker). 
You can then check that the gpu can be seen inside the container by running `nvidia-smi` inside it, and checking that your gpu appears in the output. Inside the container, we can then navigate to the notebooks folder: ``` cd /nebullvm/notebooks/speedster ``` We can then run a jupyter session with the following command: ``` jupyter notebook --allow-root --ip 0.0.0.0 --port 8888 ``` And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888` Use the token listed in the output from running the jupyter command to log in, for example: `http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b` You can finally navigate to the folder of the framework that you want to try and start a notebook. ## 2. Contributions At Nebuly we are always eager to see how our library manages to optimise more and more models. If you test nebullvm on your model and this is not already present among the notebooks, feel free to open a PR for us to add your notebook to the repository! 
================================================ FILE: optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "ef331be9", "metadata": { "id": "ef331be9" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f260653a", "metadata": { "id": "f260653a" }, "source": [ "# Accelerate Stable Diffusion with Speedster\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "8bdf3af5", "metadata": { "id": "8bdf3af5" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of Stable Diffusion inference using the Speedster module from the open-source library nebullvm. In the first section we will try using `Speedster` with the default configuration, then we will explore a more advanced option that involves the TensorRT plugins, that allow to accelerate Stable Diffusion further on GPU.\n", "\n", "Let's jump to the code." 
] }, { "cell_type": "markdown", "id": "cXXh1ifQ13mH", "metadata": { "id": "cXXh1ifQ13mH" }, "source": [ "# Installation" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks diffusers --compilers all" ] }, { "attachments": {}, "cell_type": "markdown", "id": "c2ab3de7", "metadata": {}, "source": [ "# Environment check (GPU only)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "61a1a445", "metadata": {}, "source": [ "**Please skip this section if you don't have a GPU**" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e2784bb8", "metadata": {}, "source": [ "If you want to optimize Stable Diffusion on a Nvidia GPU, in order to work properly, the following requirements must be installed on your machine:\n", "- `CUDA>=12.0`\n", "- `tensorrt>=8.6.0`\n", "- `torch<=1.13.1`" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e3bc8b4d", "metadata": {}, "source": [ "From TensorRT 8.6, all the tensorrt pre-built wheels released by nvidia support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. 
In that case, you will have to upgrade your CUDA version and then upgrade tensorrt to 8.6.0 or above to execute this notebook.\n", "\n", "There should be a way to run TensorRT 8.6 also with CUDA 11, but it requires installing TensorRT in a different way; you can check this issue: https://github.com/NVIDIA/TensorRT/issues/2773. Otherwise, we highly recommend simply upgrading to CUDA 12.\n", "\n", "For now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully." ] }, { "attachments": {}, "cell_type": "markdown", "id": "ec2267f0", "metadata": {}, "source": [ "First of all, let's check the CUDA version installed on the machine" ] }, { "cell_type": "code", "execution_count": null, "id": "82b78585", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import subprocess\n", "\n", "if torch.cuda.is_available():\n", " cuda_version = subprocess.check_output([\"nvidia-smi\"])\n", " cuda_version = int(cuda_version.decode(\"utf-8\").split(\"\\n\")[2].split(\"|\")[-2].split(\":\")[-1].strip().split(\".\")[0])\n", " assert cuda_version >= 12, (\"This notebook requires CUDA>=12.0 to be executed, please upgrade your CUDA version.\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "015cfa92", "metadata": {}, "source": [ "If you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads" ] }, { "attachments": {}, "cell_type": "markdown", "id": "563779e6", "metadata": {}, "source": [ "Then, let's check the tensorrt version installed on the platform.
Stable Diffusion optimization is supported starting from `tensorrt==8.6.0`" ] }, { "cell_type": "code", "execution_count": null, "id": "e385021d", "metadata": {}, "outputs": [], "source": [ "import tensorrt\n", "from nebullvm.tools.utils import check_module_version\n", "\n", "if torch.cuda.is_available():\n", " assert check_module_version(tensorrt, \"8.6.0\"), (\"This notebook can be run only with tensorrt>=8.6.0, if using an older version you could have issues during the optimization. Please upgrade your version.\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "61da505b", "metadata": {}, "source": [ "If you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running:\n", "```\n", "pip install -U tensorrt\n", "```" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3876bea4", "metadata": {}, "source": [ "Finally, let's check the PyTorch version" ] }, { "cell_type": "code", "execution_count": null, "id": "db83853f", "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "from nebullvm.tools.utils import check_module_version\n", "\n", "assert check_module_version(torch, max_version=\"1.13.1+cu117\"), (\"This notebook can be run only with torch<=1.13.1, if using an older version you could have issues during the optimization. 
Please downgrade your version.\")" ] }, { "cell_type": "markdown", "id": "73072506", "metadata": { "id": "73072506" }, "source": [ "## Model and Dataset setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "aeb2c521", "metadata": {}, "source": [ "Once we have ensured that the required libraries are installed, we have to choose the version of Stable Diffusion we want to optimize. Speedster officially supports the most widely used versions:\n", "- `CompVis/stable-diffusion-v1-4`\n", "- `runwayml/stable-diffusion-v1-5`\n", "- `stabilityai/stable-diffusion-2-1-base`\n", "- `stabilityai/stable-diffusion-2-1` (only on GPUs with at least 22GB of memory; if you want to try with a GPU with less memory, you have to uncomment `pipe.enable_attention_slicing()` in the cell below)\n", "\n", "Other Stable Diffusion versions from the Diffusers library should work but have never been tested. If you try a version not included among these and it works, please feel free to report it to us on [Discord](https://discord.com/invite/RbeQMu886J) so we can add it to the list of supported versions. If you try a version that does not work, you can open an issue and possibly a PR on [GitHub](https://github.com/nebuly-ai/nebullvm/issues)." ] }, { "attachments": {}, "cell_type": "markdown", "id": "e4d55115", "metadata": { "id": "e4d55115" }, "source": [ "For this notebook, we are going to select Stable Diffusion 1.4.
Let's download and load it using the diffusers API:" ] }, { "cell_type": "code", "execution_count": null, "id": "d633cf21", "metadata": { "id": "d633cf21", "scrolled": true }, "outputs": [], "source": [ "import torch\n", "from diffusers import StableDiffusionPipeline\n", "\n", "# Select Stable Diffusion version\n", "model_id = \"CompVis/stable-diffusion-v1-4\"\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "if device == \"cuda\":\n", " # On GPU we load by default the model in half precision, because it's faster and lighter.\n", " pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\n", " # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\n", "else:\n", " pipe = StableDiffusionPipeline.from_pretrained(model_id)\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "11aa0739", "metadata": { "id": "11aa0739" }, "source": [ "Let's now create an example dataset with some random sentences, that will be used later for the optimization process" ] }, { "cell_type": "code", "execution_count": null, "id": "cbbfeeb2", "metadata": { "id": "cbbfeeb2" }, "outputs": [], "source": [ "input_data = [\n", " \"a photo of an astronaut riding a horse on mars\",\n", " \"a monkey eating a banana in a forest\",\n", " \"white car on a road surrounded by palm trees\",\n", " \"a fridge full of bottles of beer\",\n", " \"madara uchiha throwing asteroids against people\"\n", "]" ] }, { "cell_type": "markdown", "id": "17040431", "metadata": { "id": "17040431" }, "source": [ "## Speed up inference with Speedster" ] }, { "cell_type": "markdown", "id": "44ddc21d", "metadata": { "id": "44ddc21d" }, "source": [ "It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`." 
] }, { "cell_type": "code", "execution_count": null, "id": "f9d934f6", "metadata": { "id": "f9d934f6" }, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "2799e3e3", "metadata": {}, "source": [ "Let's move the pipe back to CPU to free up GPU memory; `Speedster` will automatically move it back to GPU when required." ] }, { "cell_type": "code", "execution_count": null, "id": "45220cf0", "metadata": {}, "outputs": [], "source": [ "import gc\n", "\n", "# Move the pipe back to cpu\n", "pipe.to(\"cpu\")\n", "\n", "# Clean memory\n", "torch.cuda.empty_cache()\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "76248033", "metadata": { "id": "76248033" }, "source": [ "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally, a dynamic_info dictionary can also be provided, in order to support inputs with dynamic shape." ] }, { "attachments": {}, "cell_type": "markdown", "id": "75b339c3", "metadata": {}, "source": [ "**Optimization of Stable Diffusion requires a lot of RAM. If you are running this notebook on Google Colab, make sure to use the high RAM option, otherwise the kernel may crash. If the kernel also crashes when using the high RAM option, please try also adding `\"torchscript\"` to the `ignore_compilers` list. \n", "If running on GPU, the optimization requires at least 16GB of GPU memory to exploit the best techniques for optimizing the model, otherwise it may fail with a Memory Error**."
] }, { "cell_type": "code", "execution_count": null, "id": "zPC_EDwEJIM0", "metadata": { "id": "zPC_EDwEJIM0" }, "outputs": [], "source": [ "optimized_model = optimize_model(\n", " model=pipe,\n", " input_data=input_data,\n", " optimization_time=\"unconstrained\",\n", " ignore_compilers=[\"torch_tensor_rt\", \"tvm\"], # Some compilers have issues with Stable Diffusion, so it's better to skip them.\n", " metric_drop_ths=0.2,\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "fdae59d2", "metadata": {}, "source": [ "If running on GPU, here you should obtain a speedup of about 124% on the UNet. We run the optimization on a **3090Ti** and here are our results:\n", "- **Original Model (PyTorch, fp16): 51,557 ms/batch**\n", "- **Optimized Model (TensorRT, fp16): 23,055 ms/batch**\n", "\n", "If the optimized model you obtained is not a TensorRT one, probably there was an error during the optimization. If running on colab, it could happen that the standard gpu is not enough to run the optimization, so we suggest to select a premium gpu with more memory.\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "af9f86ac", "metadata": {}, "source": [ "If everything worked correctly, let's check the output of the optimized model" ] }, { "cell_type": "code", "execution_count": null, "id": "7b640885", "metadata": {}, "outputs": [], "source": [ "test_prompt = \"futuristic llama with a cyberpunk city on the background\"\n" ] }, { "cell_type": "code", "execution_count": null, "id": "fa443637", "metadata": {}, "outputs": [], "source": [ "optimized_model(test_prompt).images[0]" ] }, { "attachments": {}, "cell_type": "markdown", "id": "6e5b3b21", "metadata": { "id": "6e5b3b21" }, "source": [ "Let's run the prediction 10 times to calculate the average response time of the original model." 
] }, { "cell_type": "code", "execution_count": null, "id": "09170c78", "metadata": {}, "outputs": [], "source": [ "if device == \"cuda\":\n", " pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\n", " # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\n", "else:\n", " pipe = StableDiffusionPipeline.from_pretrained(model_id)\n", "\n", "pipe.to(device)" ] }, { "cell_type": "code", "execution_count": null, "id": "d3bc5c98", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d3bc5c98", "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681" }, "outputs": [], "source": [ "import time\n", "\n", "times = []\n", "\n", "# Warmup for 2 iterations\n", "for _ in range(2):\n", " with torch.no_grad():\n", " final_out = pipe(test_prompt).images[0]\n", "\n", "# Benchmark\n", "for _ in range(8):\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = pipe(test_prompt).images[0]\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)\n", "print(f\"Average response time for original Stable Diffusion 1.4: {original_model_time} s\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3db0a7a1", "metadata": { "id": "3db0a7a1" }, "source": [ "Let's run the prediction 10 times to calculate the average response time of the optimized model." 
] }, { "cell_type": "code", "execution_count": null, "id": "a3e83997", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a3e83997", "outputId": "7a416b14-f170-4df9-d416-026f06a7d980" }, "outputs": [], "source": [ "times = []\n", "\n", "for _ in range(2):\n", " with torch.no_grad():\n", " final_out = optimized_model(test_prompt).images[0]\n", "\n", "# Benchmark\n", "for _ in range(8):\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = optimized_model(test_prompt).images[0]\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)\n", "print(f\"Average response time for optimized Stable Diffusion 1.4: {optimized_model_time} s\")" ] }, { "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\", pipe=pipe)" ] }, { "cell_type": "markdown", "id": "cb234e5e", "metadata": { "id": "cb234e5e" }, "source": [ "Great! Was it easy? How are the results? Do you have any comments?\n", "Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n", "\n", "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. 
Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n", "\n", "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord." ] }, { "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "premium", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" }, "vscode": { "interpreter": { "hash": "4ca44071b2152bc556aa4c839392f76fd4b80aa39d34257f2d304fa0d1d8b7d9" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/diffusers/Readme.md ================================================ # **Diffusers Optimization** > :warning: In order to work properly, the diffusers optimization requires `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. For additional details, please look the docs [here](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/). This section contains all the available notebooks that show how to leverage Speedster to optimize Diffusers models. ## Notebooks: | Notebook | Description | | |:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Accelerate Diffusers Stable Diffusion](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb) | Show how to optimize with Speedster the Stable Diffusion models from Diffusers. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb) | ## Diffusers API quick view: ``` python import torch from speedster import optimize_model from diffusers import StableDiffusionPipeline # Load Stable Diffusion 1.4 as example model_id = "CompVis/stable-diffusion-v1-4" device = "cuda" if torch.cuda.is_available() else "cpu" if device == "cuda": # On GPU we load by default the model in half precision, because it's faster and lighter. pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16) else: pipe = StableDiffusionPipeline.from_pretrained(model_id) # Create some example input data input_data = [ "a photo of an astronaut riding a horse on mars", "a monkey eating a banana in a forest", "white car on a road surrounded by palm trees", "a fridge full of bottles of beer", "madara uchiha throwing asteroids against people" ] # Run Speedster optimization optimized_model = optimize_model( model=pipe, input_data=input_data, optimization_time="unconstrained", ignore_compilers=["torch_tensor_rt", "tvm"], metric_drop_ths=0.1, ) # Try the optimized model test_prompt = "futuristic llama with a cyberpunk city on the background" res = optimized_model(test_prompt).images[0] ``` ================================================ FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "ef331be9", "metadata": { "id": "ef331be9" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f260653a", "metadata": { "id": "f260653a" }, 
"source": [ "# Accelerate Hugging Face PyTorch BERT with Speedster\n" ] }, { "cell_type": "markdown", "id": "8bdf3af5", "metadata": { "id": "8bdf3af5" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n", "\n", "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "id": "d527d63b", "metadata": { "id": "d527d63b" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "id": "cXXh1ifQ13mH", "metadata": { "id": "cXXh1ifQ13mH" }, "source": [ "# Installation" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all" ] }, { "cell_type": "markdown", "id": "73072506", "metadata": { "id": "73072506" }, "source": [ "## Model and Dataset setup" ] }, { "attachments": {}, 
"cell_type": "markdown", "id": "cf24c4c4", "metadata": {}, "source": [ "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime" ] }, { "cell_type": "code", "execution_count": null, "id": "1cf8ff74", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n", "\n", "if os.path.exists(tensorrt_path):\n", " os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n", "else:\n", " print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")" ] }, { "cell_type": "markdown", "id": "e4d55115", "metadata": { "id": "e4d55115" }, "source": [ "We chose BERT as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub." ] }, { "cell_type": "code", "execution_count": null, "id": "d633cf21", "metadata": { "id": "d633cf21", "scrolled": true }, "outputs": [], "source": [ "import torch\n", "from transformers import BertTokenizer, BertModel\n", "\n", "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", "model = BertModel.from_pretrained('bert-base-uncased', torchscript=True)\n", "\n", "# Move the model to gpu if available and set eval mode\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device).eval()" ] }, { "cell_type": "markdown", "id": "11aa0739", "metadata": { "id": "11aa0739" }, "source": [ "Let's create an example dataset with some random sentences" ] }, { "cell_type": "code", "execution_count": null, "id": "cbbfeeb2", "metadata": { "id": "cbbfeeb2" }, "outputs": [], "source": [ "import random\n", "\n", "sentences = [\n", " \"Mars is the fourth planet from the Sun.\",\n", " \"has a crust primarily composed of elements\",\n", " \"However, it is unknown\",\n", " \"can be viewed from Earth\",\n", " \"It was 
the Romans\",\n", "]\n", "\n", "len_dataset = 100\n", "\n", "texts = []\n", "for _ in range(len_dataset):\n", " n_times = random.randint(1, 30)\n", " texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))" ] }, { "cell_type": "code", "execution_count": null, "id": "a09f9424", "metadata": { "id": "a09f9424" }, "outputs": [], "source": [ "encoded_inputs = [tokenizer(text, return_tensors=\"pt\") for text in texts]" ] }, { "cell_type": "markdown", "id": "17040431", "metadata": { "id": "17040431" }, "source": [ "## Speed up inference with Speedster: no metric drop" ] }, { "cell_type": "markdown", "id": "44ddc21d", "metadata": { "id": "44ddc21d" }, "source": [ "It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`." ] }, { "cell_type": "code", "execution_count": null, "id": "f9d934f6", "metadata": { "id": "f9d934f6" }, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model" ] }, { "cell_type": "markdown", "id": "76248033", "metadata": { "id": "76248033" }, "source": [ "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape."
] }, { "cell_type": "code", "execution_count": null, "id": "zPC_EDwEJIM0", "metadata": { "id": "zPC_EDwEJIM0" }, "outputs": [], "source": [ "dynamic_info = {\n", " \"inputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch', 1: 'num_tokens'},\n", " ],\n", " \"outputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch'},\n", " ]\n", "}\n", "\n", "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "98c6ab09", "metadata": { "id": "98c6ab09" }, "outputs": [], "source": [ "import time\n", "\n", "# Move inputs to gpu if available\n", "encoded_inputs = [tokenizer(text, return_tensors=\"pt\").to(device) for text in texts]" ] }, { "cell_type": "markdown", "id": "6e5b3b21", "metadata": { "id": "6e5b3b21" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the original model." 
] }, { "cell_type": "code", "execution_count": null, "id": "d3bc5c98", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d3bc5c98", "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original BERT: {original_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "12c2df98", "metadata": { "id": "12c2df98" }, "source": [ "Let's see the output of the original model" ] }, { "cell_type": "code", "execution_count": null, "id": "4892a905", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4892a905", "outputId": "68d9b65f-e2cc-4998-8047-c9091f977698" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "3db0a7a1", "metadata": { "id": "3db0a7a1" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the optimized model."
] }, { "cell_type": "code", "execution_count": null, "id": "a3e83997", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a3e83997", "outputId": "7a416b14-f170-4df9-d416-026f06a7d980" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized BERT (no metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "0d884d61", "metadata": { "id": "0d884d61" }, "source": [ "Let's see the output of the optimized_model" ] }, { "cell_type": "code", "execution_count": null, "id": "75611b2e", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "75611b2e", "outputId": "035d5c6d-fd7a-4506-af09-befcf9dd3b2d" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Speed up inference with Speedster: metric drop" ] }, { "cell_type": "markdown", "id": "7b1950d5", "metadata": { "id": "7b1950d5" }, "source": [ "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup" ] }, { "cell_type": "code", "execution_count": null, "id": "de5721d8", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "de5721d8", "outputId": "c9efff21-f963-47ff-e83d-a44615f90a10" }, "outputs": [], "source": [ "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\",
\"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", " metric_drop_ths=0.1,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "0fbfe6fa", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0fbfe6fa", "outputId": "ada293f5-9b54-4186-8e48-74b994d4b797" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original BERT: {original_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f89b7e6d", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f89b7e6d", "outputId": "51e497e1-a533-432d-d68e-b373f0ef69cb" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "code", "execution_count": null, "id": "10d17b5c", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "10d17b5c", "outputId": "d5dc0acd-77e7-4054-b455-19343ff37951" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized BERT (metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "6bf3d1fb", "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "6bf3d1fb", "outputId": "6163d8ba-254f-47d2-a468-a921622a15ba" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "cell_type": "markdown", "id": "cb234e5e", "metadata": { "id": "cb234e5e" }, "source": [ "Great! Was it easy? How are the results? Do you have any comments?\n", "Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n", "\n", "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n", "\n", "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord." ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "premium", "kernelspec": { "display_name": "nebullvm_new", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" }, "vscode": { "interpreter": { "hash": "4fbc45cd27f7d363500c2e8640d9fdb717da4e1d8e4954a68e42b53d65ee27af" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "ef331be9", "metadata": { "id": "ef331be9" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f260653a", "metadata": { "id": "f260653a" }, "source": [ "# Accelerate Hugging Face PyTorch DistilBERT with Speedster\n" ] }, { "cell_type": "markdown", "id": "8bdf3af5", "metadata": { "id": "8bdf3af5" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n", "\n", "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "id": "d527d63b", "metadata": { "id": "d527d63b" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "id": "cXXh1ifQ13mH", "metadata": { "id": "cXXh1ifQ13mH" }, "source": [ "# Installation" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all" ] }, { "cell_type": "markdown", "id": "73072506", "metadata": { "id": "73072506" }, "source": [ "## Model and Dataset setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "cf24c4c4", "metadata": {}, "source": [ "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime" ] }, { "cell_type": "code", "execution_count": null, "id": "1cf8ff74", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n", "\n", "if os.path.exists(tensorrt_path):\n", " os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n", "else:\n", " print(\"Unable to find TensorRT path. 
ONNXRuntime won't use TensorrtExecutionProvider.\")" ] }, { "cell_type": "markdown", "id": "e4d55115", "metadata": { "id": "e4d55115" }, "source": [ "We chose DistilBERT as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub." ] }, { "cell_type": "code", "execution_count": null, "id": "d633cf21", "metadata": { "id": "d633cf21", "scrolled": true }, "outputs": [], "source": [ "import torch\n", "from transformers import DistilBertTokenizer, DistilBertModel\n", "\n", "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "model = DistilBertModel.from_pretrained(\"distilbert-base-uncased\", torchscript=True)\n", "\n", "# Move the model to gpu if available and set eval mode\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device).eval()" ] }, { "cell_type": "markdown", "id": "11aa0739", "metadata": { "id": "11aa0739" }, "source": [ "Let's create an example dataset with some random sentences" ] }, { "cell_type": "code", "execution_count": null, "id": "cbbfeeb2", "metadata": { "id": "cbbfeeb2" }, "outputs": [], "source": [ "import random\n", "\n", "sentences = [\n", " \"Mars is the fourth planet from the Sun.\",\n", " \"has a crust primarily composed of elements\",\n", " \"However, it is unknown\",\n", " \"can be viewed from Earth\",\n", " \"It was the Romans\",\n", "]\n", "\n", "len_dataset = 100\n", "\n", "texts = []\n", "for _ in range(len_dataset):\n", " n_times = random.randint(1, 30)\n", " texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))" ] }, { "cell_type": "code", "execution_count": null, "id": "a09f9424", "metadata": { "id": "a09f9424" }, "outputs": [], "source": [ "encoded_inputs = [tokenizer(text, return_tensors=\"pt\") for text in texts]" ] }, { "cell_type": "markdown", "id": "17040431", "metadata": { "id": "17040431" }, "source": [ "## Speed up inference with Speedster: no 
metric drop" ] }, { "cell_type": "markdown", "id": "44ddc21d", "metadata": { "id": "44ddc21d" }, "source": [ "It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`." ] }, { "cell_type": "code", "execution_count": null, "id": "f9d934f6", "metadata": { "id": "f9d934f6" }, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model" ] }, { "cell_type": "markdown", "id": "76248033", "metadata": { "id": "76248033" }, "source": [ "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape." ] }, { "cell_type": "code", "execution_count": null, "id": "zPC_EDwEJIM0", "metadata": { "id": "zPC_EDwEJIM0" }, "outputs": [], "source": [ "dynamic_info = {\n", " \"inputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch', 1: 'num_tokens'}\n", " ],\n", " \"outputs\": [\n", " {0: 'batch', 1: 'num_tokens'}\n", " ]\n", "}\n", "\n", "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "98c6ab09", "metadata": { "id": "98c6ab09" }, "outputs": [], "source": [ "import time\n", "\n", "# Move inputs to gpu if available\n", "encoded_inputs = [tokenizer(text, return_tensors=\"pt\").to(device) for text in texts]" ] }, { "cell_type": "markdown", "id": "6e5b3b21", "metadata": { "id": "6e5b3b21" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the original model."
] }, { "cell_type": "code", "execution_count": null, "id": "d3bc5c98", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d3bc5c98", "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original DistilBERT: {original_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "12c2df98", "metadata": { "id": "12c2df98" }, "source": [ "Let's see the output of the original model" ] }, { "cell_type": "code", "execution_count": null, "id": "4892a905", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4892a905", "outputId": "68d9b65f-e2cc-4998-8047-c9091f977698" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "3db0a7a1", "metadata": { "id": "3db0a7a1" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the optimized model." 
] }, { "cell_type": "code", "execution_count": null, "id": "a3e83997", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a3e83997", "outputId": "7a416b14-f170-4df9-d416-026f06a7d980" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized DistilBERT (no metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "0d884d61", "metadata": { "id": "0d884d61" }, "source": [ "Let's see the output of the optimized_model" ] }, { "cell_type": "code", "execution_count": null, "id": "75611b2e", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "75611b2e", "outputId": "035d5c6d-fd7a-4506-af09-befcf9dd3b2d" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Speed up inference with Speedster: metric drop" ] }, { "cell_type": "markdown", "id": "7b1950d5", "metadata": { "id": "7b1950d5" }, "source": [ "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup" ] }, { "cell_type": "code", "execution_count": null, "id": "de5721d8", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "de5721d8", "outputId": "c9efff21-f963-47ff-e83d-a44615f90a10" }, "outputs": [], "source": [ "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", "
ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", " metric_drop_ths=0.1,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "0fbfe6fa", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0fbfe6fa", "outputId": "ada293f5-9b54-4186-8e48-74b994d4b797" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original DistilBERT: {original_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f89b7e6d", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f89b7e6d", "outputId": "51e497e1-a533-432d-d68e-b373f0ef69cb" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "code", "execution_count": null, "id": "10d17b5c", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "10d17b5c", "outputId": "d5dc0acd-77e7-4054-b455-19343ff37951" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized DistilBERT (metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "6bf3d1fb", 
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6bf3d1fb", "outputId": "6163d8ba-254f-47d2-a468-a921622a15ba" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "cell_type": "markdown", "id": "cb234e5e", "metadata": { "id": "cb234e5e" }, "source": [ "Great! Was it easy? How are the results? Do you have any comments?\n", "Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n", "\n", "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n", "\n", "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord." ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "premium", "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "ef331be9", "metadata": { "id": "ef331be9" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f260653a", "metadata": { "id": "f260653a" }, "source": [ "# Accelerate Hugging Face PyTorch GPT2 with Speedster\n" ] }, { "cell_type": "markdown", "id": "8bdf3af5", "metadata": { "id": "8bdf3af5" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n", "\n", "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "id": "d527d63b", "metadata": { "id": "d527d63b" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "id": "cXXh1ifQ13mH", "metadata": { "id": "cXXh1ifQ13mH" }, "source": [ "# Installation" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all" ] }, { "cell_type": "markdown", "id": "73072506", "metadata": { "id": "73072506" }, "source": [ "## Model and Dataset setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "cf24c4c4", "metadata": {}, "source": [ "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime" ] }, { "cell_type": "code", "execution_count": null, "id": "1cf8ff74", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n", "\n", "if os.path.exists(tensorrt_path):\n", " os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n", "else:\n", " print(\"Unable to find TensorRT path. 
ONNXRuntime won't use TensorrtExecutionProvider.\")" ] }, { "cell_type": "markdown", "id": "e4d55115", "metadata": { "id": "e4d55115" }, "source": [ "We chose GPT2 as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub." ] }, { "cell_type": "code", "execution_count": null, "id": "d633cf21", "metadata": { "colab": { "background_save": true }, "id": "d633cf21", "scrolled": true }, "outputs": [], "source": [ "import torch\n", "from transformers import GPT2Tokenizer, GPT2Model\n", "\n", "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n", "model = GPT2Model.from_pretrained('gpt2', torchscript=True)\n", "\n", "# Move the model to gpu if available and set eval mode\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device).eval()" ] }, { "cell_type": "markdown", "id": "11aa0739", "metadata": { "id": "11aa0739" }, "source": [ "Let's create an example dataset with some random sentences" ] }, { "cell_type": "code", "execution_count": null, "id": "cbbfeeb2", "metadata": { "colab": { "background_save": true }, "id": "cbbfeeb2" }, "outputs": [], "source": [ "import random\n", "\n", "sentences = [\n", " \"Mars is the fourth planet from the Sun.\",\n", " \"has a crust primarily composed of elements\",\n", " \"However, it is unknown\",\n", " \"can be viewed from Earth\",\n", " \"It was the Romans\",\n", "]\n", "\n", "len_dataset = 100\n", "\n", "texts = []\n", "for _ in range(len_dataset):\n", " n_times = random.randint(1, 30)\n", " texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))" ] }, { "cell_type": "code", "execution_count": null, "id": "a09f9424", "metadata": { "colab": { "background_save": true }, "id": "a09f9424" }, "outputs": [], "source": [ "encoded_inputs = [tokenizer(text, return_tensors=\"pt\") for text in texts]" ] }, { "cell_type": "markdown", "id": "17040431", "metadata": { "id": "17040431" }, "source": [ "## 
Speed up inference with Speedster: no metric drop" ] }, { "cell_type": "markdown", "id": "44ddc21d", "metadata": { "id": "44ddc21d" }, "source": [ "It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`." ] }, { "cell_type": "code", "execution_count": null, "id": "f9d934f6", "metadata": { "id": "f9d934f6" }, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model" ] }, { "cell_type": "markdown", "id": "76248033", "metadata": { "id": "76248033" }, "source": [ "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as an example and the optimization time mode. Optionally a dynamic_info dictionary can also be provided, in order to support inputs with dynamic shape." ] }, { "cell_type": "code", "execution_count": null, "id": "zPC_EDwEJIM0", "metadata": { "id": "zPC_EDwEJIM0" }, "outputs": [], "source": [ "dynamic_info = {\n", " \"inputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch', 1: 'num_tokens'}\n", " ],\n", " \"outputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " ] + [{0: 'batch', 2: 'num_tokens'} for i in range(24)]\n", "}\n", "\n", "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "98c6ab09", "metadata": { "id": "98c6ab09" }, "outputs": [], "source": [ "import time\n", "\n", "# Move inputs to gpu if available\n", "encoded_inputs = [tokenizer(text, return_tensors=\"pt\").to(device) for text in texts]" ] }, { "cell_type": "markdown", "id": "6e5b3b21", "metadata": { "id": "6e5b3b21" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the original model."
] }, { "cell_type": "code", "execution_count": null, "id": "d3bc5c98", "metadata": { "id": "d3bc5c98" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original GPT2: {original_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "12c2df98", "metadata": { "id": "12c2df98" }, "source": [ "Let's see the output of the original model" ] }, { "cell_type": "code", "execution_count": null, "id": "4892a905", "metadata": { "id": "4892a905" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "3db0a7a1", "metadata": { "id": "3db0a7a1" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the optimized model." 
] }, { "cell_type": "code", "execution_count": null, "id": "a3e83997", "metadata": { "id": "a3e83997" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized GPT2 (no metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "0d884d61", "metadata": { "id": "0d884d61" }, "source": [ "Let's see the output of the optimized_model" ] }, { "cell_type": "code", "execution_count": null, "id": "75611b2e", "metadata": { "id": "75611b2e" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Speed up inference with Speedster: metric drop" ] }, { "cell_type": "markdown", "id": "7b1950d5", "metadata": { "id": "7b1950d5" }, "source": [ "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup" ] }, { "cell_type": "code", "execution_count": null, "id": "de5721d8", "metadata": { "id": "de5721d8" }, "outputs": [], "source": [ "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", " metric_drop_ths=0.1,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "0fbfe6fa", "metadata": { "id": "0fbfe6fa" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for
encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original GPT2: {original_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f89b7e6d", "metadata": { "id": "f89b7e6d" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "code", "execution_count": null, "id": "10d17b5c", "metadata": { "id": "10d17b5c" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized GPT2 (metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "6bf3d1fb", "metadata": { "id": "6bf3d1fb" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": 
{}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "cell_type": "markdown", "id": "cb234e5e", "metadata": { "id": "cb234e5e" }, "source": [ "Great! Was it easy? How are the results? Do you have any comments?\n", "Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n", "\n", "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n", "\n", "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord." ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "premium", "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "ef331be9", "metadata": { "id": "ef331be9" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f260653a", "metadata": { "id": "f260653a" }, "source": [ "# Accelerate Hugging Face T5 with Speedster\n" ] }, { "cell_type": "markdown", "id": "8bdf3af5", "metadata": { "id": "8bdf3af5" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n", "\n", "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "id": "d527d63b", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d527d63b", "outputId": "57626bac-e458-487f-f4fa-a459627af296" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "id": "cXXh1ifQ13mH", "metadata": { "id": "cXXh1ifQ13mH" }, "source": [ "# Installation" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all" ] }, { "cell_type": "markdown", "id": "73072506", "metadata": { "id": "73072506" }, "source": [ "## Model and Dataset setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "cf24c4c4", "metadata": {}, "source": [ "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime" ] }, { "cell_type": "code", "execution_count": null, "id": "1cf8ff74", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n", "\n", "if os.path.exists(tensorrt_path):\n", " os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n", 
"else:\n", " print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")" ] }, { "cell_type": "markdown", "id": "e4d55115", "metadata": { "id": "e4d55115" }, "source": [ "We chose T5-efficient-base as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub." ] }, { "cell_type": "code", "execution_count": null, "id": "NOgOmfdY_dav", "metadata": { "id": "NOgOmfdY_dav" }, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n", "import torch\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "model_name = \"google/t5-efficient-base\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torchscript=True).to(device)\n", "\n", "# set the model to eval mode\n", "_ = model.eval()" ] }, { "cell_type": "markdown", "id": "11aa0739", "metadata": { "id": "11aa0739" }, "source": [ "Let's create an example dataset with some random sentences" ] }, { "cell_type": "code", "execution_count": null, "id": "ghGcDNFtKt3X", "metadata": { "id": "ghGcDNFtKt3X" }, "outputs": [], "source": [ "texts = [\n", " \"\"\"BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.\"\"\",\n", " \"\"\"GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. 
More precisely, it was trained to guess the next word in sentences.\"\"\",\n", " \"\"\"With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task.\"\"\",\n", " \"\"\"LayoutLMv3 is a pre-trained multimodal Transformer for Document AI with unified text and image masking. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model. For example, LayoutLMv3 can be fine-tuned for both text-centric tasks, including form understanding, receipt understanding, and document visual question answering, and image-centric tasks such as document image classification and document layout analysis.\"\"\",\n", " \"\"\"XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context. Overall, XLNet achieves state-of-the-art (SOTA) results on various downstream language tasks including question answering, natural language inference, sentiment analysis, and document ranking.\"\"\"\n", "]\n", "texts = texts*20" ] }, { "cell_type": "code", "execution_count": null, "id": "a09f9424", "metadata": { "id": "a09f9424" }, "outputs": [], "source": [ "encoded_inputs = [tokenizer(text, padding=\"longest\", return_tensors=\"pt\") for text in texts]" ] }, { "cell_type": "markdown", "id": "17040431", "metadata": { "id": "17040431" }, "source": [ "## Speed up inference with Speedster: no metric drop" ] }, { "cell_type": "markdown", "id": "44ddc21d", "metadata": { "id": "44ddc21d" }, "source": [ "It's now time of improving a bit the performance in terms of speed. 
Let's use `Speedster`." ] }, { "cell_type": "code", "execution_count": null, "id": "f9d934f6", "metadata": { "id": "f9d934f6" }, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model" ] }, { "cell_type": "markdown", "id": "76248033", "metadata": { "id": "76248033" }, "source": [ "Usually Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as an example and the optimization time mode. But for this type of model, we need to do some extra steps because the current version of Speedster doesn't have direct support for encoder-decoder models. This type of model has both an encoder and a decoder. For example, BERT models are encoder models and GPT models are decoder models, but T5 has both." ] }, { "cell_type": "code", "execution_count": null, "id": "i7sgUWjePN9i", "metadata": { "id": "i7sgUWjePN9i" }, "outputs": [], "source": [ "# First, we get the encoder and decoder from the model\n", "encoder = model.get_encoder()\n", "decoder = model.get_decoder()" ] }, { "cell_type": "markdown", "id": "O7xaI1drQOQ0", "metadata": { "id": "O7xaI1drQOQ0" }, "source": [ "Optionally a dynamic_info dictionary can also be provided, in order to support inputs with dynamic shape."
] }, { "cell_type": "code", "execution_count": null, "id": "nTUPdDchQLc1", "metadata": { "id": "nTUPdDchQLc1" }, "outputs": [], "source": [ "dynamic_info = {\n", " \"inputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch', 1: 'num_tokens'}\n", " ],\n", " \"outputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " ]\n", "}" ] }, { "cell_type": "code", "execution_count": null, "id": "zPC_EDwEJIM0", "metadata": { "id": "zPC_EDwEJIM0" }, "outputs": [], "source": [ "# Create the optimized encoder model separately\n", "optimized_encoder_model = optimize_model(\n", " model=encoder,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "7Oa68a87Qjre", "metadata": { "id": "7Oa68a87Qjre" }, "outputs": [], "source": [ "# Create the optimized decoder model separately\n", "optimized_decoder_model = optimize_model(\n", " model=decoder,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "98c6ab09", "metadata": { "id": "98c6ab09" }, "outputs": [], "source": [ "import time\n", "\n", "# Move inputs to gpu if available\n", "encoded_inputs = [tokenizer(text, padding=\"longest\", return_tensors=\"pt\").to(device) for text in texts]" ] }, { "cell_type": "markdown", "id": "6e5b3b21", "metadata": { "id": "6e5b3b21" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the original model."
] }, { "cell_type": "code", "execution_count": null, "id": "d3bc5c98", "metadata": { "id": "d3bc5c98" }, "outputs": [], "source": [ "times = []\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " encoder_out = encoder(**encoded_input)\n", " decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " encoder_out = encoder(**encoded_input)\n", " decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original T5: {original_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "GU0SwykMTVAj", "metadata": { "id": "GU0SwykMTVAj" }, "source": [ "In real-world use cases, we would pass the decoder output to `model.lm_head` to get the actual prediction, but here we are only testing the performance improvements, so we skip that step." ] }, { "cell_type": "markdown", "id": "12c2df98", "metadata": { "id": "12c2df98" }, "source": [ "Let's see the output of the original model" ] }, { "cell_type": "code", "execution_count": null, "id": "4892a905", "metadata": { "id": "4892a905" }, "outputs": [], "source": [ "encoder(**encoded_input)" ] }, { "cell_type": "code", "execution_count": null, "id": "gx0naPVuSVrm", "metadata": { "id": "gx0naPVuSVrm" }, "outputs": [], "source": [ "decoder(**encoded_input,encoder_hidden_states=encoder_out[0])" ] }, { "cell_type": "markdown", "id": "3db0a7a1", "metadata": { "id": "3db0a7a1" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the optimized model."
] }, { "cell_type": "code", "execution_count": null, "id": "a3e83997", "metadata": { "id": "a3e83997" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " encoder_out = optimized_encoder_model(**encoded_input)\n", " decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " encoder_out = optimized_encoder_model(**encoded_input)\n", " decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized T5 (no metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "0d884d61", "metadata": { "id": "0d884d61" }, "source": [ "Let's see the output of the optimized_model" ] }, { "cell_type": "code", "execution_count": null, "id": "75611b2e", "metadata": { "id": "75611b2e" }, "outputs": [], "source": [ "optimized_encoder_model(**encoded_input)" ] }, { "cell_type": "code", "execution_count": null, "id": "cpieoDfwS-V7", "metadata": { "id": "cpieoDfwS-V7" }, "outputs": [], "source": [ "optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])" ] }, { "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Speed up inference with Speedster: metric drop" ] }, { "cell_type": "markdown", "id": "7b1950d5", "metadata": { "id": "7b1950d5" }, "source": [ "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup" ] }, { "cell_type": "code", "execution_count": null, "id": "VwOLWZSZUM89", "metadata": { "id": "VwOLWZSZUM89" }, "outputs": [], "source": [ "optimized_encoder_model =
optimize_model(\n", " model=encoder,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", " metric_drop_ths=0.1,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "FIKn4V3dUIZB", "metadata": { "id": "FIKn4V3dUIZB" }, "outputs": [], "source": [ "optimized_decoder_model = optimize_model(\n", " model=decoder,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n", " dynamic_info=dynamic_info,\n", " metric_drop_ths=0.1,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "0fbfe6fa", "metadata": { "id": "0fbfe6fa" }, "outputs": [], "source": [ "times = []\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " encoder_out = encoder(**encoded_input)\n", " decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " encoder_out = encoder(**encoded_input)\n", " decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original T5: {original_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f89b7e6d", "metadata": { "id": "f89b7e6d" }, "outputs": [], "source": [ "encoder(**encoded_input)" ] }, { "cell_type": "code", "execution_count": null, "id": "oI1zjIBSUoIU", "metadata": { "id": "oI1zjIBSUoIU" }, "outputs": [], "source": [ "decoder(**encoded_input,encoder_hidden_states=encoder_out[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "10d17b5c", "metadata": { "id": "10d17b5c" }, "outputs": [], "source": [ "times = 
[]\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " with torch.no_grad():\n", " encoder_out = optimized_encoder_model(**encoded_input)\n", " decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " with torch.no_grad():\n", " encoder_out = optimized_encoder_model(**encoded_input)\n", " decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized T5 (metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "4XFMC1S6zXTU", "metadata": { "id": "4XFMC1S6zXTU" }, "source": [ "## Save and reload the optimized model" ] }, { "cell_type": "markdown", "id": "OXHVr3EAzbT5", "metadata": { "id": "OXHVr3EAzbT5" }, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "3M565P-zzaFB", "metadata": { "id": "3M565P-zzaFB" }, "outputs": [], "source": [ "save_model(optimized_encoder_model, \"encoder_model_save_path\")\n", "save_model(optimized_decoder_model, \"decoder_model_save_path\")" ] }, { "cell_type": "markdown", "id": "ee8CS_Evzg1j", "metadata": { "id": "ee8CS_Evzg1j" }, "source": [ "We can then load again the model:\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "zOQ88SY_zg-A", "metadata": { "id": "zOQ88SY_zg-A" }, "outputs": [], "source": [ "optimized_encoder_model = load_model(\"encoder_model_save_path\")\n", "optimized_decoder_model = load_model(\"decoder_model_save_path\")" ] }, { "cell_type": "markdown", "id": "cb234e5e", "metadata": { "id": "cb234e5e" }, "source": [ "Great! Was it easy? How are the results? 
Do you have any comments?\n", "Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n", "\n", "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n", "\n", "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord." ] }, { "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "provenance": [] }, "gpuClass": "premium", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb ================================================ { "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "ef331be9", "metadata": { "id": "ef331be9" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "cell_type": "markdown", "id": "f260653a", "metadata": { "id": "f260653a" }, "source": [ "# Accelerate Hugging Face TensorFlow BERT with Speedster\n" ] }, { "cell_type": "markdown", "id": "8bdf3af5", "metadata": { "id": "8bdf3af5" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n", "\n", "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "id": "d527d63b", "metadata": { "id": "d527d63b" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "id": "cXXh1ifQ13mH", "metadata": { "id": "cXXh1ifQ13mH" }, "source": [ "# Installation" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all" ] }, { "cell_type": "markdown", "id": "73072506", "metadata": { "id": "73072506" }, "source": [ "## Model and Dataset setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "cf24c4c4", "metadata": {}, "source": [ "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime" ] }, { "cell_type": "code", "execution_count": null, "id": "1cf8ff74", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n", "\n", "if os.path.exists(tensorrt_path):\n", " os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n", "else:\n", " print(\"Unable to find TensorRT path. 
ONNXRuntime won't use TensorrtExecutionProvider.\")" ] }, { "cell_type": "markdown", "id": "e4d55115", "metadata": { "id": "e4d55115" }, "source": [ "We chose BERT as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub." ] }, { "cell_type": "code", "execution_count": null, "id": "d633cf21", "metadata": { "id": "d633cf21", "scrolled": true }, "outputs": [], "source": [ "from transformers import BertTokenizer, TFBertModel\n", "\n", "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", "model = TFBertModel.from_pretrained('bert-base-uncased')" ] }, { "cell_type": "markdown", "id": "11aa0739", "metadata": { "id": "11aa0739" }, "source": [ "Let's create an example dataset with some random sentences" ] }, { "cell_type": "code", "execution_count": null, "id": "cbbfeeb2", "metadata": { "id": "cbbfeeb2" }, "outputs": [], "source": [ "import random\n", "\n", "sentences = [\n", " \"Mars is the fourth planet from the Sun.\",\n", " \"has a crust primarily composed of elements\",\n", " \"However, it is unknown\",\n", " \"can be viewed from Earth\",\n", " \"It was the Romans\",\n", "]\n", "\n", "len_dataset = 100\n", "\n", "texts = []\n", "for _ in range(len_dataset):\n", " n_times = random.randint(1, 30)\n", " texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))" ] }, { "cell_type": "code", "execution_count": null, "id": "a09f9424", "metadata": { "id": "a09f9424" }, "outputs": [], "source": [ "encoded_inputs = [tokenizer(text, return_tensors=\"tf\") for text in texts]" ] }, { "cell_type": "markdown", "id": "17040431", "metadata": { "id": "17040431" }, "source": [ "## Speed up inference with Speedster: no metric drop" ] }, { "cell_type": "markdown", "id": "44ddc21d", "metadata": { "id": "44ddc21d" }, "source": [ "It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`." 
] }, { "cell_type": "code", "execution_count": null, "id": "f9d934f6", "metadata": { "id": "f9d934f6" }, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model" ] }, { "cell_type": "markdown", "id": "76248033", "metadata": { "id": "76248033" }, "source": [ "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape." ] }, { "cell_type": "code", "execution_count": null, "id": "zPC_EDwEJIM0", "metadata": { "id": "zPC_EDwEJIM0" }, "outputs": [], "source": [ "dynamic_info = {\n", " \"inputs\": [\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch', 1: 'num_tokens'},\n", " {0: 'batch', 1: 'num_tokens'},\n", " ],\n", " \"outputs\": [\n", " {0: \"batch\", 1: \"num_tokens\"},\n", " {0: \"batch\"}\n", " ]\n", "}\n", "\n", "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " ignore_compilers=[\"tvm\"],\n", " dynamic_info=dynamic_info,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "98c6ab09", "metadata": { "id": "98c6ab09" }, "outputs": [], "source": [ "import time\n", "\n", "encoded_inputs = [tokenizer(text, return_tensors=\"tf\") for text in texts]" ] }, { "cell_type": "markdown", "id": "6e5b3b21", "metadata": { "id": "6e5b3b21" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the original model." 
] }, { "cell_type": "code", "execution_count": null, "id": "d3bc5c98", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d3bc5c98", "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original DistilBERT: {original_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "12c2df98", "metadata": { "id": "12c2df98" }, "source": [ "Let's see the output of the original model" ] }, { "cell_type": "code", "execution_count": null, "id": "4892a905", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4892a905", "outputId": "68d9b65f-e2cc-4998-8047-c9091f977698" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "3db0a7a1", "metadata": { "id": "3db0a7a1" }, "source": [ "Let's run the prediction 100 times to calculate the average response time of the optimized model." 
] }, { "cell_type": "code", "execution_count": null, "id": "a3e83997", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a3e83997", "outputId": "7a416b14-f170-4df9-d416-026f06a7d980" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized BERT (no metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "markdown", "id": "0d884d61", "metadata": { "id": "0d884d61" }, "source": [ "Let's see the output of the optimized_model" ] }, { "cell_type": "code", "execution_count": null, "id": "75611b2e", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "75611b2e", "outputId": "035d5c6d-fd7a-4506-af09-befcf9dd3b2d" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Speed up inference with Speedster: metric drop" ] }, { "cell_type": "markdown", "id": "7b1950d5", "metadata": { "id": "7b1950d5" }, "source": [ "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain an higher speedup" ] }, { "cell_type": "code", "execution_count": null, "id": "de5721d8", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "de5721d8", "outputId": "c9efff21-f963-47ff-e83d-a44615f90a10" }, "outputs": [], "source": [ "optimized_model = optimize_model(\n", " model=model,\n", " input_data=encoded_inputs,\n", " optimization_time=\"constrained\",\n", " dynamic_info=dynamic_info,\n", " ignore_compilers=[\"tvm\"],\n", " metric_drop_ths=0.1,\n", 
")" ] }, { "cell_type": "code", "execution_count": null, "id": "0fbfe6fa", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0fbfe6fa", "outputId": "ada293f5-9b54-4186-8e48-74b994d4b797" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " final_out = model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " final_out = model(**encoded_input)\n", " times.append(time.time()-st)\n", "original_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for original BERT: {original_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f89b7e6d", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f89b7e6d", "outputId": "51e497e1-a533-432d-d68e-b373f0ef69cb" }, "outputs": [], "source": [ "model(**encoded_input)" ] }, { "cell_type": "code", "execution_count": null, "id": "10d17b5c", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "10d17b5c", "outputId": "d5dc0acd-77e7-4054-b455-19343ff37951" }, "outputs": [], "source": [ "times = []\n", "\n", "# Warmup for 30 iterations\n", "for encoded_input in encoded_inputs[:30]:\n", " final_out = optimized_model(**encoded_input)\n", "\n", "# Benchmark\n", "for encoded_input in encoded_inputs:\n", " st = time.time()\n", " final_out = optimized_model(**encoded_input)\n", " times.append(time.time()-st)\n", "optimized_model_time = sum(times)/len(times)*1000\n", "print(f\"Average response time for optimized BERT (metric drop): {optimized_model_time} ms\")" ] }, { "cell_type": "code", "execution_count": null, "id": "6bf3d1fb", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6bf3d1fb", "outputId": "6163d8ba-254f-47d2-a468-a921622a15ba" }, "outputs": [], "source": [ "optimized_model(**encoded_input)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", 
"metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "cell_type": "markdown", "id": "cb234e5e", "metadata": { "id": "cb234e5e" }, "source": [ "Great! Was it easy? How are the results? Do you have any comments?\n", "Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n", "\n", "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n", "\n", "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord." ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "premium", "kernelspec": { "display_name": "Python 3.9.15 ('nebullvm_new')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:52:10) \n[Clang 14.0.6 ]" }, "vscode": { "interpreter": { "hash": "4fbc45cd27f7d363500c2e8640d9fdb717da4e1d8e4954a68e42b53d65ee27af" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/huggingface/Readme.md ================================================ # **Hugging Face Optimization** This section contains all the available notebooks that show how to leverage Speedster to optimize Hugging Face models. Hugging Face hosts models that can use either PyTorch or TensorFlow as backend. Both the backends are supported by Speedster. 
## Notebooks: | Notebook | Description | | |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Accelerate Hugging Face PyTorch GPT2](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb) | Show how to optimize with Speedster the GPT2 model from Hugging Face with PyTorch backend. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb) | | [Accelerate Hugging Face PyTorch BERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb) | Show how to optimize with Speedster the BERT model from Hugging Face with PyTorch backend. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb) | | [Accelerate Hugging Face PyTorch DistilBERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb) | Show how to optimize with Speedster the DistilBERT model from Hugging Face with PyTorch backend. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb) | | | [Accelerate Hugging Face TensorFlow BERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb) | Show how to optimize with Speedster the BERT model from Hugging Face with TensorFlow backend. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb) | | [Accelerate Hugging Face PyTorch T5](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb) | Show how to optimize with Speedster the T5 model from Hugging Face with PyTorch backend. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb) | ## Hugging Face API quick view: ``` python from speedster import optimize_model from transformers import AlbertModel, AlbertTokenizer # Load Albert as example model = AlbertModel.from_pretrained("albert-base-v1") tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1") # Case 1: dictionary input format text = "This is an example text for the huggingface model." 
input_dict = tokenizer(text, return_tensors="pt") # set return_tensors="tf" or "np" for tensorflow models # Run Speedster optimization optimized_model = optimize_model( model, input_data=[input_dict] ) ## Warmup the model ## This step is necessary before the latency computation of the ## optimized model in order to get reliable results. # for _ in range(10): # optimized_model(**input_dict) # Try the optimized model res = optimized_model(**input_dict) # # Case 2: strings input format # input_data = [ # "This is a test.", # "Hi my name is John.", # "The cat is on the table.", # ] # tokenizer_args = dict( # return_tensors="pt", # set return_tensors="tf" or "np" for tensorflow models # padding="longest", # truncation=True, # ) # # # Run Speedster optimization # optimized_model = optimize_model( # model, input_data=input_data, tokenizer=tokenizer, tokenizer_args=tokenizer_args # ) ``` ================================================ FILE: optimization/speedster/notebooks/huggingface/faster_transformer_bert.py ================================================ # %% import logging import random import time import speedster import torch from speedster import optimize_model # %% from nebullvm.operations.optimizations.compilers.faster_transformer.bert import ( # noqa: E501 detect_and_swap_bert_model, ) # %% from nebullvm.operations.optimizations.compilers.utils import ( get_faster_transformer_repo_path, ) from transformers import BertTokenizer from transformers.models.bert.modeling_bert import ( BertForSequenceClassification as HFBertForSequenceClassification, ) # %% print(speedster.__file__) lib_path = str( get_faster_transformer_repo_path() / "build" / "lib" / "libth_transformer.so" ) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # %% # https://huggingface.co/bert-base-cased-finetuned-mrpc # %% def prepare_examples(tokenizer, len_dataset=1000): sentences = [ "Mars is the fourth planet from the Sun.", "has a crust primarily composed of elements", 
"However, it is unknown", "can be viewed from Earth", "It was the Romans", ] texts = [] for _ in range(len_dataset): n_times = random.randint(1, 30) texts.append( " ".join(random.choice(sentences) for _ in range(n_times)) ) encoded_inputs = [ tokenizer(text, return_tensors="pt", truncation=True).to(device) for text in texts ] len(encoded_inputs), encoded_inputs[0].keys() fake_input_id = torch.LongTensor(per_gpu_eval_batch_size, max_seq_length) fake_input_id.fill_(1) fake_input_id = fake_input_id.to(device) fake_mask = torch.ones(per_gpu_eval_batch_size, max_seq_length).to(device) fake_type_id = fake_input_id.clone().detach() if data_type == "fp16": fake_mask = fake_mask.half() elif data_type == "bf16": fake_mask = fake_mask.bfloat16() return encoded_inputs, fake_input_id, fake_mask, fake_type_id # %% logger = logging.getLogger(__name__) use_ths = use_torchscript = False remove_padding = False data_type = "fp16" # "fp32", "fp16", "bf16" per_gpu_eval_batch_size = 1 max_seq_length = 128 model_name_or_path = "bert-base-cased-finetuned-mrpc" model = HFBertForSequenceClassification.from_pretrained( model_name_or_path, torchscript=True ) model.eval().to(device) tokenizer = BertTokenizer.from_pretrained(model_name_or_path) encoded_inputs, fake_input_id, fake_mask, fake_type_id = prepare_examples( tokenizer ) def optimize_no_trace(model, data_type="fp16"): model = detect_and_swap_bert_model( model, data_type="fp16", lib_path=lib_path, remove_padding=False ) if data_type == "fp16": logger.info("Use fp16") model.half() elif data_type == "bf16": logger.info("Use bf16") model.bfloat16() return model.to(device) def optimize_with_trace( model, data_type, per_gpu_eval_batch_size, max_seq_length ): model = optimize_no_trace(model, data_type) logger.info("Use TorchScript mode") fake_input_id = torch.LongTensor(per_gpu_eval_batch_size, max_seq_length) fake_input_id.fill_(1) fake_input_id = fake_input_id.to(device) fake_mask = torch.ones(per_gpu_eval_batch_size, 
max_seq_length).to(device) fake_type_id = fake_input_id.clone().detach() if data_type == "fp16": fake_mask = fake_mask.half() elif data_type == "bf16": fake_mask = fake_mask.bfloat16() model.eval() with torch.no_grad(): model_ = torch.jit.trace( model, (fake_input_id, fake_mask, fake_type_id) ) return model_ def benchmark(model, model_desc="original BERT"): times = [] # Warmup for 30 iterations for encoded_input in encoded_inputs[:30]: with torch.no_grad(): _ = model(**encoded_input) # Benchmark for encoded_input in encoded_inputs: st = time.perf_counter() with torch.no_grad(): _ = model(**encoded_input) times.append(time.perf_counter() - st) original_model_time = sum(times) / len(times) * 1000 print(f"Average response time for {model_desc}: {original_model_time} ms") print(f"{encoded_inputs[0].keys()}") benchmark(model, "BERT") benchmark(model, "BERT") data_type = "fp16" # "fp32", "fp16", "bf16 per_gpu_eval_batch_size = 1 max_seq_length = 128 faster_model = optimize_no_trace(model, data_type) benchmark(faster_model, "faster BERT (no metric drop)") # Average response time for BERT: 4.741025467636064 ms # Average response time for BERT: 4.686204055091366 ms fastest_model = optimize_with_trace( model, data_type, per_gpu_eval_batch_size, max_seq_length ) benchmark(fastest_model, "fastest BERT (no metric drop)") # Average response time for faster BERT (no metric drop): 1.5583459960762411 ms # noqa: E501 # the above operations modifies `model` in-place # so we need reload a fresh one to test speedster model = HFBertForSequenceClassification.from_pretrained( model_name_or_path, torchscript=True ) # Average response time for fastest BERT (no metric drop): 1.4657320715487003 ms # noqa: E501 model.eval().to(device) dynamic_info = { "inputs": [ {0: "batch", 1: "num_tokens"}, {0: "batch", 1: "num_tokens"}, {0: "batch", 1: "num_tokens"}, ], "outputs": [{0: "batch", 1: "num_tokens"}], } speedster_optimized_model = optimize_model( model=model, input_data=encoded_inputs, 
optimization_time="constrained", # force it to use fastertransformer ignore_compilers=["tensor_rt", "tvm", "onnxruntime", "torchscript"], dynamic_info=dynamic_info, ) benchmark( speedster_optimized_model, "speedster optimized BERT (no metric drop)" ) benchmark( speedster_optimized_model, "speedster optimized BERT (no metric drop)" ) # Average response time for speedster optimized BERT (no metric drop): 14.040142675396055 ms # noqa: E501 # Average response time for speedster optimized BERT (no metric drop): 3.4986357542220503 ms # noqa: E501 speedster_optimized_model_fp16 = optimize_model( model=model, input_data=encoded_inputs, optimization_time="constrained", # force it to use fastertransformer ignore_compilers=["tensor_rt", "tvm", "onnxruntime", "torchscript"], dynamic_info=dynamic_info, metric_drop_ths=0.1, ) benchmark( speedster_optimized_model_fp16, "speedster optimized BERT (metric drop)" ) benchmark( speedster_optimized_model_fp16, "speedster optimized BERT (metric drop)" ) # Average response time for speedster optimized BERT (no metric drop): 14.040142675396055 ms # noqa: E501 # Average response time for speedster optimized BERT (no metric drop): 3.4986357542220503 ms # noqa: E501 ================================================ FILE: optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "p5b0PzpW1xJq" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Accelerate ONNX ResNet50 with Speedster" ] }, { "cell_type": "markdown", "metadata": { "id": "T9xuwZEHzN2K" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference 
using the Speedster app from the open-source library `nebullvm`.\n", "\n", "We will\n", "1. Install Speedster and the deep learning compilers used by the library.\n", "2. Speed up an ONNX ResNet50 without any loss of accuracy.\n", "3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5Yc5KYo_YzE8" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "metadata": { "id": "HbFy2Aykz2Qo" }, "source": [ "# Installation" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks onnx --compilers all" ] }, { "cell_type": "markdown", "metadata": { "id": "N5RXHoZl0p3p" }, "source": [ "# Optimization example with ONNX" ] }, { "cell_type": "markdown", "metadata": { "id": "-Ju-VcRH01Mw" }, "source": [ "In the following example we will try to optimize a standard ONNX resnet50.\n", "\n", "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. 
Read more in the [docs](https://nebuly.gitbook.io/nebuly/nebullvm/get-started).\n", "\n", "Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate it under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)." ] }, { "cell_type": "markdown", "metadata": { "id": "skxEuemn171G" }, "source": [ "## Scenario 1 - No accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "wVRLXrDi2VaG" }, "source": [ "First of all we download the pretrained ONNX resnet50 model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6I5GDvWbZ-LJ", "outputId": "6ac09b39-9c6e-4d38-dfb6-35069938f9c1" }, "outputs": [], "source": [ "!wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx" ] }, { "cell_type": "markdown", "metadata": { "id": "vrkOvGfkaXk7" }, "source": [ "Then we optimize it with Speedster simple API" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2RbgGruAeQcf" }, "outputs": [], "source": [ "import numpy as np\n", "from speedster import optimize_model, save_model, load_model\n", "\n", "# Load a resnet as example\n", "model = \"resnet50-v1-12.onnx\"\n", "\n", "# Provide an input data for the model \n", "input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0]))]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\"\n", ")\n", "\n", "# Try the optimized model\n", "x = np.random.randn(1, 3, 224, 224).astype(np.float32)\n", "res_optimized = optimized_model(x)" ] }, { "cell_type": "markdown", "metadata": { "id": "i2IKNc2jbax8" }, "source": [ "We can print the type of the optimized model to see which compiler was faster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { 
"base_uri": "https://localhost:8080/" }, "id": "dFhqAhr0bcbZ", "outputId": "aa0b2fe9-2fa0-405b-8e44-3ebbf70f0e69" }, "outputs": [], "source": [ "optimized_model" ] }, { "cell_type": "markdown", "metadata": { "id": "_UuiqkEfcPy4" }, "source": [ "In our case, the optimized model type was NumpyONNXInferenceLearner, so this means that onnxruntime was the faster compiler.\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "E4759DQJcc15" }, "source": [ "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement" ] }, { "cell_type": "markdown", "metadata": { "id": "ktQaNfGqceOD" }, "source": [ "First of all, let's compute and print the original model result\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gUMlNAZrcj5-", "outputId": "3670f41f-b2db-4b55-dbf7-c9b0a0146c9d" }, "outputs": [], "source": [ "import onnx\n", "import onnxruntime as ort\n", "from typing import Dict, List\n", "\n", "\n", "def get_input_names(onnx_model: str):\n", " model = onnx.load(onnx_model)\n", " input_all = [node.name for node in model.graph.input]\n", " return input_all\n", "\n", "\n", "def get_output_names(onnx_model: str):\n", " model = onnx.load(onnx_model)\n", " output_all = [node.name for node in model.graph.output]\n", " return output_all\n", "\n", "\n", "def run_onnx_model(\n", " onnx_model: str, session: ort.InferenceSession, input_tensors: List[np.ndarray], inputs: Dict, output_names: str\n", ") -> List[np.ndarray]:\n", " \n", " res = session.run(\n", " output_names=output_names, input_feed=inputs\n", " )\n", " return list(res)\n", "\n", "\n", "session = ort.InferenceSession(\n", " model,\n", " providers=[\"CUDAExecutionProvider\", \"CPUExecutionProvider\"] # Change to [\"CPUExecutionProvider\"] if run on cpu\n", ")\n", "\n", "inputs = {\n", " name: array\n", " for name, array in 
zip(get_input_names(model), [x])\n", "}\n", "\n", "res_original = run_onnx_model(model, session, [x], inputs, get_output_names(model))\n", "res_original" ] }, { "cell_type": "markdown", "metadata": { "id": "iU3dPwSTfWr_" }, "source": [ "Then, let's print the optimized model result that we computed before" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "S1EKoJ75fVAh", "outputId": "73e7b127-e7d3-44a9-bd78-65961bd051df" }, "outputs": [], "source": [ "res_optimized" ] }, { "cell_type": "markdown", "metadata": { "id": "Lj4crPMmf_LX" }, "source": [ "Then, let's compute the average latency of the baseline model:\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rGNKr_ShgBbu" }, "outputs": [], "source": [ "import time" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "I2G4OzhCgG_D", "outputId": "a23eb4ea-fa0f-4221-a177-20876e452b53" }, "outputs": [], "source": [ "num_iters = 100\n", "\n", "# Warmup\n", "for i in range(10):\n", " run_onnx_model(model, session, [x], inputs, get_output_names(model))\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " run_onnx_model(model, session, [x], inputs, get_output_names(model))\n", "stop = time.time()\n", "\n", "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))" ] }, { "cell_type": "markdown", "metadata": { "id": "f-jmRjJvgW5V" }, "source": [ "Finally we compute the average latency for the optimized model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "51c3uaMcgaR-", "outputId": "1319a7bc-df1d-4f19-9426-3940ab4a7c5e" }, "outputs": [], "source": [ "# Warmup\n", "for i in range(10):\n", " optimized_model(x)\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " optimized_model(x)\n", "stop = 
time.time()\n", "\n", "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))" ] }, { "cell_type": "markdown", "metadata": { "id": "tBeRKNTI3iyK" }, "source": [ "## Scenario 2 - Accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "w3wutIzfAMe_" }, "source": [ "In this scenario, we set a max threshold for the accuracy drop to 2%" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fO1nGqpj3p7z" }, "outputs": [], "source": [ "import numpy as np\n", "from speedster import optimize_model\n", "\n", "# Load a resnet as example\n", "model = \"resnet50-v1-12.onnx\"\n", "\n", "# Provide an input data for the model\n", "# Note that in this case we should provide the model at least 100 data samples\n", "input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for i in range(100)]\n", "\n", "# Run nebullvm optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\", metric = \"accuracy\", metric_drop_ths = 0.02\n", ")\n", "\n", "# Try the optimized model\n", "x = np.random.randn(1, 3, 224, 224).astype(np.float32)\n", "res_optimized = optimized_model(x)" ] }, { "cell_type": "markdown", "metadata": { "id": "4UFtwZbEiLv3" }, "source": [ "Here we compute the average latency for the baseline model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qFKHaHM6-GKm", "outputId": "73b95996-4d1f-4aa7-a96d-a40070bf36bd" }, "outputs": [], "source": [ "num_iters = 100\n", "\n", "# Warmup\n", "for i in range(10):\n", " run_onnx_model(model, session, [x], inputs, get_output_names(model))\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " run_onnx_model(model, session, [x], inputs, get_output_names(model))\n", "stop = time.time()\n", "\n", "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))" 
] }, { "cell_type": "markdown", "metadata": { "id": "J8g0aJRJiXA5" }, "source": [ "Here we compute the average latency for the optimized model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_IbAW0KA4Fm5", "outputId": "67f44401-9568-4f38-802a-d81e3139af5a" }, "outputs": [], "source": [ "# Warmup\n", "for i in range(10):\n", " optimized_model(x)\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " optimized_model(x)\n", "stop = time.time()\n", "\n", "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/onnx/Readme.md ================================================ # **ONNX Optimization** This section contains all the available notebooks that show how to leverage Speedster to optimize ONNX models. ## Notebooks: | Notebook | Description | | |:--------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Accelerate ONNX Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model in ONNX format. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) | ## ONNX API quick view: ```python import numpy as np from speedster import optimize_model # Load a resnet as example # Model was downloaded from here: # https://github.com/onnx/models/tree/main/vision/classification/resnet model = "resnet50-v1-12.onnx" # Provide an input data for the model input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0]))] # Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="unconstrained" ) # Try the optimized model x = np.random.randn(1, 3, 224, 224).astype(np.float32) ## Warmup the model ## This step is necessary before the latency computation of the ## optimized model in order to get reliable results. # for _ in range(10): # optimized_model(x) res_optimized = optimized_model(x) ``` ================================================ FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "p5b0PzpW1xJq" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Accelerate PyTorch ResNet50 with Speedster" ] }, { "cell_type": "markdown", "metadata": { "id": "T9xuwZEHzN2K" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using Speedster app from the open-source library `nebullvm`.\n", "\n", "We will\n", "1. 
Install Speedster and the deep learning compilers used by the library.\n", "2. Speed up a PyTorch ResNet50 without any loss of accuracy.\n", "3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_0ZRCXCR9693", "outputId": "19096862-5c5c-4f9f-b2ad-3ce084ccf213" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "metadata": { "id": "HbFy2Aykz2Qo" }, "source": [ "### Installation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZPJHVZ74d8r2" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "metadata": { "id": "b0CLgQqxyrQi" }, "source": [ "Let's now install the deep learning compilers used by Speedster that are not yet installed on the hardware.\n", "\n", "The installation of the compilers may take a few minutes." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GvK9mZSjeLU5" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all" ] }, { "cell_type": "markdown", "metadata": { "id": "N5RXHoZl0p3p" }, "source": [ "## Optimization example with Pytorch" ] }, { "cell_type": "markdown", "metadata": { "id": "-Ju-VcRH01Mw" }, "source": [ "In the following example we will try to optimize a standard resnet50 loaded directly from torchvision.\n", "\n", "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. 
The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n", "\n", "Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate it under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)." ] }, { "cell_type": "markdown", "metadata": { "id": "skxEuemn171G" }, "source": [ "### Scenario 1 - No accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "wVRLXrDi2VaG" }, "source": [ "First we load the model and optimize it using the Speedster API:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2RbgGruAeQcf" }, "outputs": [], "source": [ "import torch\n", "import torchvision.models as models\n", "from speedster import optimize_model, save_model, load_model\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "# Load a resnet as example\n", "model = models.resnet50().to(device)\n", "\n", "# Provide an input data for the model \n", "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\"\n", ")\n", "\n", "# Try the optimized model\n", "x = torch.randn(1, 3, 256, 256).to(device)\n", "model.eval()\n", "res_optimized = optimized_model(x)\n", "res_original = model(x)" ] }, { "cell_type": "markdown", "metadata": { "id": "JMiuufyu2gD3" }, "source": [ "We can print the type of the optimized model to see which compiler was faster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ifuLyQsM9697", "outputId": "c1534e0d-e5bb-4d44-91e9-652593751d52" }, "outputs": [], "source": [ "optimized_model" ] }, { "cell_type": "markdown", "metadata": { "id": 
"4WxcxrUC9698" }, "source": [ "In our case, the optimized model type was PytorchTensorRTInferenceLearner, so this means that Pytorch-TensorRT was the faster compiler." ] }, { "cell_type": "markdown", "metadata": { "id": "iwHKfT349698" }, "source": [ "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement" ] }, { "cell_type": "markdown", "metadata": { "id": "-IMJpfcb9698" }, "source": [ "First of all, let's print the results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uI8Kd1Z49698", "outputId": "832d3053-d6c8-4cc2-9b48-a59dfaa45d33" }, "outputs": [], "source": [ "res_original" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0I_zSpv29698", "outputId": "a0ba566d-6730-4954-8dd0-eb47b549cbf1" }, "outputs": [], "source": [ "res_optimized" ] }, { "cell_type": "markdown", "metadata": { "id": "hBEtrYOd9699" }, "source": [ "Then, let's compare the performances:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GqxiCAbpfcwV" }, "outputs": [], "source": [ "from nebullvm.tools.benchmark import benchmark" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_0b0Bzwq-czD" }, "outputs": [], "source": [ "# Set the model to eval mode and move it to the available device\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "model.eval()\n", "model.to(device)" ] }, { "cell_type": "markdown", "metadata": { "id": "UqxzStjD2v0r" }, "source": [ "Here we compute the average throughput for the baseline model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dkt67_Orwlv4", "outputId": "fc10c03c-c3ad-44d4-9fd6-c9b6dc0256c7" }, "outputs": [], "source": [ 
"benchmark(model, input_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "AgOv-GqQ3KIC" }, "source": [ "Here we compute the average throughput for the optimized model:\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4PodpaDVfwzT", "outputId": "27a42560-93a2-4c19-e68d-360093fe914c" }, "outputs": [], "source": [ "benchmark(optimized_model, input_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "tBeRKNTI3iyK" }, "source": [ "## Scenario 2 - Accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "w3wutIzfAMe_" }, "source": [ "In this scenario, we set a max threshold for the accuracy drop to 2%" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fO1nGqpj3p7z" }, "outputs": [], "source": [ "import torch\n", "import torchvision.models as models\n", "from speedster import optimize_model\n", "\n", "# Load a resnet as example\n", "model = models.resnet50().to(device)\n", "\n", "# Provide 100 random input data for the model \n", "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\", metric=\"accuracy\", metric_drop_ths=0.02\n", ")\n", "\n", "# Try the optimized model\n", "x = torch.randn(1, 3, 256, 256).to(device)\n", "res = optimized_model(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qFKHaHM6-GKm" }, "outputs": [], "source": [ "# Set the model to eval mode and move it to the available device\n", "\n", "model.eval()\n", "model.to(device)" ] }, { "cell_type": "markdown", "metadata": { "id": "yfW9kmHX-pGi" }, "source": [ "Here we compute the average throughput for the baseline model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0MMrL3959hli", 
"outputId": "2e8d27ec-a9f3-4f70-8c75-a0df974f2653" }, "outputs": [], "source": [ "benchmark(model, input_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "i3GqasOM-u8f" }, "source": [ "Here we compute the average throughput for the optimized model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_IbAW0KA4Fm5", "outputId": "48d83c89-5687-42aa-a3b8-6989bcb66aa6" }, "outputs": [], "source": [ "benchmark(optimized_model, input_data)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "p5b0PzpW1xJq" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Accelerate PyTorch VisionTransformer with Speedster" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "T9xuwZEHzN2K" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using Speedster app from the open-source library `nebullvm`.\n", "\n", "We will\n", "1. Install Speedster and the deep learning compilers used by the library.\n", "2. Speed up a PyTorch ViT without any loss of accuracy.\n", "3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n", "\n", "Let's jump to the code." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_0ZRCXCR9693", "outputId": "19096862-5c5c-4f9f-b2ad-3ce084ccf213" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "metadata": { "id": "HbFy2Aykz2Qo" }, "source": [ "### Installation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZPJHVZ74d8r2" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "metadata": { "id": "b0CLgQqxyrQi" }, "source": [ "Let's now install the deep learning compilers used by Speedster that are not yet installed on the hardware.\n", "\n", "The installation of the compilers may take a few minutes." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GvK9mZSjeLU5" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all" ] }, { "cell_type": "markdown", "metadata": { "id": "N5RXHoZl0p3p" }, "source": [ "## Optimization example with Pytorch" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "-Ju-VcRH01Mw" }, "source": [ "In the following example we will try to optimize a ViT model loaded directly from vit_pytorch library.\n", "\n", "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. 
Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n", "\n", "Let's first test the optimization without any loss in accuracy (metric_drop_ths=0, which is the default value), and then attempt to further accelerate it while constraining the loss of accuracy to a maximum of 2% (metric = 'accuracy', metric_drop_ths = 0.02)." ] }, { "cell_type": "markdown", "metadata": { "id": "skxEuemn171G" }, "source": [ "### Scenario 1 - No accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "wVRLXrDi2VaG" }, "source": [ "First we load the model and optimize it using the Speedster API:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2RbgGruAeQcf" }, "outputs": [], "source": [ "import torch\n", "from vit_pytorch import ViT\n", "from speedster import optimize_model, save_model, load_model\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "# Load a ViT model\n", "model = ViT(\n", " image_size = 256,\n", " patch_size = 32,\n", " num_classes = 1000,\n", " dim = 1024,\n", " depth = 6,\n", " heads = 16,\n", " mlp_dim = 2048,\n", " dropout = 0.1,\n", " emb_dropout = 0.1\n", ").to(device)\n", "\n", "# Provide an input data for the model \n", "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\"\n", ")\n", "\n", "# Try the optimized model\n", "x = torch.randn(1, 3, 256, 256).to(device)\n", "model.to(device).eval()\n", "res_optimized = optimized_model(x)\n", "res_original = model(x)" ] }, { "cell_type": "markdown", "metadata": { "id": "JMiuufyu2gD3" }, "source": [ "We can print the type of the optimized model to see which compiler was faster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ifuLyQsM9697", "outputId": 
"c1534e0d-e5bb-4d44-91e9-652593751d52" }, "outputs": [], "source": [ "optimized_model" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "4WxcxrUC9698" }, "source": [ "In our case, the optimized model type was TorchScriptInferenceLearner, so this means that TorchScriptCompiler was the faster compiler." ] }, { "cell_type": "markdown", "metadata": { "id": "iwHKfT349698" }, "source": [ "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement" ] }, { "cell_type": "markdown", "metadata": { "id": "-IMJpfcb9698" }, "source": [ "First of all, let's print the results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uI8Kd1Z49698", "outputId": "832d3053-d6c8-4cc2-9b48-a59dfaa45d33" }, "outputs": [], "source": [ "res_original" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0I_zSpv29698", "outputId": "a0ba566d-6730-4954-8dd0-eb47b549cbf1" }, "outputs": [], "source": [ "res_optimized" ] }, { "cell_type": "markdown", "metadata": { "id": "hBEtrYOd9699" }, "source": [ "Then, let's compare the performances:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "GqxiCAbpfcwV" }, "outputs": [], "source": [ "from nebullvm.tools.benchmark import benchmark" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_0b0Bzwq-czD" }, "outputs": [], "source": [ "# Set the model to eval mode and move it to the available device\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "model.eval()\n", "model.to(device)" ] }, { "cell_type": "markdown", "metadata": { "id": "UqxzStjD2v0r" }, "source": [ "Here we compute the average throughput for the baseline model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { 
"colab": { "base_uri": "https://localhost:8080/" }, "id": "dkt67_Orwlv4", "outputId": "fc10c03c-c3ad-44d4-9fd6-c9b6dc0256c7" }, "outputs": [], "source": [ "benchmark(model, input_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "AgOv-GqQ3KIC" }, "source": [ "Here we compute the average throughput for the optimized model:\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4PodpaDVfwzT", "outputId": "27a42560-93a2-4c19-e68d-360093fe914c" }, "outputs": [], "source": [ "benchmark(optimized_model, input_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "tBeRKNTI3iyK" }, "source": [ "## Scenario 2 - Accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "w3wutIzfAMe_" }, "source": [ "In this scenario, we set a max threshold for the accuracy drop to 2%" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fO1nGqpj3p7z" }, "outputs": [], "source": [ "import torch\n", "import torchvision.models as models\n", "from speedster import optimize_model\n", "\n", "# Load a ViT model\n", "model = ViT(\n", " image_size = 256,\n", " patch_size = 32,\n", " num_classes = 1000,\n", " dim = 1024,\n", " depth = 6,\n", " heads = 16,\n", " mlp_dim = 2048,\n", " dropout = 0.1,\n", " emb_dropout = 0.1\n", ").to(device)\n", "\n", "# Provide 100 random input data for the model \n", "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\", metric=\"accuracy\", metric_drop_ths=0.02\n", ")\n", "\n", "# Try the optimized model\n", "x = torch.randn(1, 3, 256, 256).to(device)\n", "res = optimized_model(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qFKHaHM6-GKm" }, "outputs": [], "source": [ "# Set the model to eval mode and move it to the available 
device\n", "\n", "model.eval()\n", "model.to(device)" ] }, { "cell_type": "markdown", "metadata": { "id": "yfW9kmHX-pGi" }, "source": [ "Here we compute the average throughput for the baseline model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0MMrL3959hli", "outputId": "2e8d27ec-a9f3-4f70-8c75-a0df974f2653" }, "outputs": [], "source": [ "benchmark(model, input_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "i3GqasOM-u8f" }, "source": [ "Here we compute the average throughput for the optimized model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_IbAW0KA4Fm5", "outputId": "48d83c89-5687-42aa-a3b8-6989bcb66aa6" }, "outputs": [], "source": [ "benchmark(optimized_model, input_data)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": 13, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": 14, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "3c977e4a", "metadata": { "id": "3c977e4a" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "6240f0ea", "metadata": { "id": "6240f0ea" }, "source": [ "# Accelerate PyTorch YOLOv5 with Speedster\n", "\n" ] }, { "cell_type": "markdown", "id": "6cfcd562", "metadata": { "id": "6cfcd562" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n", "\n", "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "id": "38171e92", "metadata": {}, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "id": "okgu97ThVwnH", "metadata": { "id": "okgu97ThVwnH" }, "source": [ "### Install Speedster" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all" ] }, { "cell_type": "markdown", "id": "e62f5afa", "metadata": { "id": "e62f5afa" }, "source": [ "### Install and test YOLO" ] }, { "cell_type": "markdown", "id": "b38d727d", "metadata": { "id": "b38d727d" }, "source": [ "Let's install YOLO." ] }, { "cell_type": "code", "execution_count": null, "id": "f48f6a35", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f48f6a35", "outputId": "5b06307a-9196-4e5e-a542-1254d6c94ce2", "scrolled": true }, "outputs": [], "source": [ "! pip install -r https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt" ] }, { "attachments": {}, "cell_type": "markdown", "id": "92f49833", "metadata": { "id": "92f49833" }, "source": [ "We start by downloading the model from the Torch hub."
] }, { "cell_type": "code", "execution_count": null, "id": "2dc46f67", "metadata": { "id": "2dc46f67" }, "outputs": [], "source": [ "import copy\n", "import time\n", "import types\n", "\n", "import torch" ] }, { "cell_type": "code", "execution_count": null, "id": "ead6637d", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 248, "referenced_widgets": [ "7f41159d22fe4ce7b8e7789a92478242", "2ecf6a6cfad64af698a88479ba95005b", "e7a2646ac0cd4afba67823799147ce13", "fd77306783b84b489b90d072a44a27d8", "94a4bc5454074b5c900186a60a950d19", "682cafb37aa34c75961d61d2665a50b7", "5e71284dc02f4346b217732643c90b86", "881f619ee75547a49c6d48fd3140721c", "56a1b99b282a4a63a64f48347963a5ab", "a59557bb103e4a3b96062c60d539db35", "65786546f69b420b9ec8451c97338f30" ] }, "id": "ead6637d", "outputId": "8d44d380-535d-446c-fcb0-bb55ba9e9f84" }, "outputs": [], "source": [ "# Load Model\n", "model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, force_reload=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "KcteQ5tsWy1v", "metadata": { "id": "KcteQ5tsWy1v" }, "outputs": [], "source": [ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)" ] }, { "cell_type": "markdown", "id": "37d07ab0", "metadata": { "id": "37d07ab0" }, "source": [ "## Optimization with Speedster" ] }, { "cell_type": "markdown", "id": "332cbc38", "metadata": { "id": "332cbc38" }, "source": [ "Now we are ready for optimizing the body of YOLOv5 using the `Speedster` function `optimize_model`." ] }, { "attachments": {}, "cell_type": "markdown", "id": "d1fc4d01", "metadata": { "id": "d1fc4d01" }, "source": [ "Speedster was built to be very easy to use. To optimize a model, you only need to specify the model, the batch size and input size for each input tensor, and a directory in which to save the optimized model. 
In the example, we chose the same directory in which this notebook runs.\n", "\n", "With the latest API, there are two ways to use Speedster:\n", "\n", "- Option A: Accelerate the model up to ~10 times without any loss in performance (accuracy/precision/etc.)\n", "- Option B: Accelerate the model up to ~30 times with a pre-defined maximum loss in performance\n", " \n", "To learn more about how to use Speedster, check out the readme on GitHub ." ] }, { "cell_type": "markdown", "id": "ceb07403", "metadata": { "id": "ceb07403" }, "source": [ "In this example, we provide the code to run option B." ] }, { "cell_type": "code", "execution_count": null, "id": "74f9f650", "metadata": { "id": "74f9f650" }, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b729ccce", "metadata": {}, "source": [ "Let's load some example data to feed the optimize_model function" ] }, { "cell_type": "code", "execution_count": null, "id": "20c15b09", "metadata": { "id": "20c15b09" }, "outputs": [], "source": [ "from PIL import Image\n", "import requests\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "id": "8fcf6332", "metadata": { "id": "8fcf6332" }, "outputs": [], "source": [ "img_name = \"zidane.png\"\n", "imgs = ['https://ultralytics.com/images/zidane.jpg'] # batch of images\n", "Image.open(requests.get(imgs[0], stream=True).raw).save(img_name)" ] }, { "cell_type": "code", "execution_count": null, "id": "178a31f1", "metadata": { "id": "178a31f1" }, "outputs": [], "source": [ "def read_and_crop(im, original_model, img_size):\n", " p = next(original_model.parameters())\n", " im = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im)\n", " max_y, max_x = im.size\n", " ptr_x = np.random.choice(max_x-img_size[0])\n", " ptr_y = np.random.choice(max_y-img_size[1])\n", " im = np.array(im.crop((ptr_y, ptr_x, ptr_y + img_size[1], ptr_x +
img_size[0])))\n", " x = np.expand_dims(im, axis=0)\n", " x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2))) # stack and BHWC to BCHW\n", " x = torch.from_numpy(x).to(p.device).type_as(p) / 255 # uint8 to fp16/32\n", " return x" ] }, { "cell_type": "code", "execution_count": null, "id": "51757959", "metadata": { "id": "51757959" }, "outputs": [], "source": [ "input_data = [((read_and_crop(img_name, model, (640, 640)),), None) for _ in range(100)]" ] }, { "cell_type": "code", "execution_count": null, "id": "c01adfeb", "metadata": { "id": "c01adfeb" }, "outputs": [], "source": [ "model_optimized = optimize_model(\n", " model=model,\n", " input_data=input_data,\n", " optimization_time=\"unconstrained\",\n", " metric_drop_ths=0.05\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "495c1642", "metadata": {}, "source": [ "Let's compare the original model performance with the optimized one:" ] }, { "cell_type": "code", "execution_count": null, "id": "82e39d5b", "metadata": { "id": "82e39d5b" }, "outputs": [], "source": [ "from nebullvm.tools.benchmark import benchmark\n", "\n", "original_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, force_reload=True)\n", "print(\"Benchmark original model\")\n", "benchmark(original_model, input_data)\n", "\n", "print(\"Benchmark optimized model\")\n", "benchmark(model_optimized, input_data)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f0d6d006", "metadata": {}, "source": [ "Let's ensure that the output of the original model is the same as the optimized model" ] }, { "cell_type": "code", "execution_count": null, "id": "66c0dbab", "metadata": {}, "outputs": [], "source": [ "input_tensor = torch.randn(1, 3, 640, 640).to(device)" ] }, { "cell_type": "code", "execution_count": null, "id": "bfe573fd", "metadata": {}, "outputs": [], "source": [ "model(input_tensor)" ] }, { "cell_type": "code", "execution_count": null, "id": "89654058", "metadata": {}, "outputs": [], "source": [ 
"model_optimized(input_tensor)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b72bdf54", "metadata": {}, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ada71f91", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "99b3a9d0", "metadata": {}, "outputs": [], "source": [ "save_model(model_optimized, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "6308ddd7", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "f9946f6b", "metadata": {}, "outputs": [], "source": [ "model_optimized = load_model(\"model_save_path\")\n" ] }, { "cell_type": "markdown", "id": "d50807de", "metadata": { "id": "d50807de" }, "source": [ "What an amazing result, right?!? Stay tuned for more cool content from the Nebuly team :) " ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" } }, "widgets": { "application/vnd.jupyter.widget-state+json": { "2ecf6a6cfad64af698a88479ba95005b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_682cafb37aa34c75961d61d2665a50b7", "placeholder": "​", "style": "IPY_MODEL_5e71284dc02f4346b217732643c90b86", "value": "100%" } }, "56a1b99b282a4a63a64f48347963a5ab": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "5e71284dc02f4346b217732643c90b86": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": 
"DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "65786546f69b420b9ec8451c97338f30": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "682cafb37aa34c75961d61d2665a50b7": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7f41159d22fe4ce7b8e7789a92478242": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": 
"1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_2ecf6a6cfad64af698a88479ba95005b", "IPY_MODEL_e7a2646ac0cd4afba67823799147ce13", "IPY_MODEL_fd77306783b84b489b90d072a44a27d8" ], "layout": "IPY_MODEL_94a4bc5454074b5c900186a60a950d19" } }, "881f619ee75547a49c6d48fd3140721c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "94a4bc5454074b5c900186a60a950d19": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, 
"bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a59557bb103e4a3b96062c60d539db35": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e7a2646ac0cd4afba67823799147ce13": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": 
[], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_881f619ee75547a49c6d48fd3140721c", "max": 14808437, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_56a1b99b282a4a63a64f48347963a5ab", "value": 14808437 } }, "fd77306783b84b489b90d072a44a27d8": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a59557bb103e4a3b96062c60d539db35", "placeholder": "​", "style": "IPY_MODEL_65786546f69b420b9ec8451c97338f30", "value": " 14.1M/14.1M [00:00<00:00, 24.5MB/s]" } } } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb ================================================ { "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "![New Release: Accelerate YOLOv8](assets/yolov8.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Accelerate Ultralytics YOLOv8 with Speedster" ] }, { "cell_type": "markdown", "id": "6cfcd562", "metadata": { "id": "6cfcd562" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster module from the open-source library nebullvm.\n", "\n", "With Speedster's latest API, you 
can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Install Speedster" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Install Ultralytics YOLOv8" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install ultralytics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load YOLOv8s" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from ultralytics import YOLO\n", "\n", "yolo = YOLO('yolov8s.pt')" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Let's load some dummy test data and see the original output:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_data = torch.randn(1, 3, 640, 640)\n", "yolo.model(test_data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The original YOLOv8 model returns as output a tuple where the first element is a tensor and
the second is a list of tensors. Speedster currently supports only models that return only tensors, so we need to create a wrapper to overcome this issue:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "class YOLOWrapper(torch.nn.Module):\n", " def __init__(self, yolo_model):\n", " super().__init__()\n", " self.model = yolo_model.model\n", " \n", " def forward(self, x, *args, **kwargs):\n", " res = self.model(x)\n", " return res[0], *tuple(res[1])\n", " \n", "model_wrapper = YOLOWrapper(yolo)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## YOLOv8s Optimization with GPU" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can now optimize the model using speedster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from speedster import optimize_model\n", "\n", "# Provide some input data for the model \n", "input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can finally restore the original output format by wrapping the optimized model in a new class:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class OptimizedYOLO(torch.nn.Module):\n", " def __init__(self, optimized_model):\n", " super().__init__()\n", " self.model = optimized_model\n", " \n", " def forward(self, x, *args, **kwargs):\n", " res = self.model(x)\n", " return res[0], list(res[1:])\n", " \n", "optimized_wrapper = OptimizedYOLO(optimized_model)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "optimized_wrapper(test_data.cuda())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## YOLOv8s Optimization 
with CPU" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from speedster import optimize_model, save_model, load_model\n", "from ultralytics import YOLO\n", "\n", "yolo = YOLO('yolov8s.pt')\n", "model_wrapper = YOLOWrapper(yolo)\n", "\n", "# Provide some input data for the model \n", "input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True, device=\"cpu\"\n", ")\n", "\n", "optimized_wrapper = OptimizedYOLO(optimized_model)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "optimized_wrapper(test_data)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b72bdf54", "metadata": {}, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ada71f91", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "99b3a9d0", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "6308ddd7", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "f9946f6b", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")\n", "optimized_wrapper = OptimizedYOLO(optimized_model)" ] }, { "cell_type": "markdown", "id": "d50807de", "metadata": { "id": "d50807de" }, "source": [ "What an amazing result, right?!? Stay tuned for more cool content from the Nebuly team :) " ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "wQS9kNoyjsKe" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Accelerate Fast AI ResNet34 with Speedster" ] }, { "cell_type": "markdown", "metadata": { "id": "hBObeC3SmRwl" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the open-source library nebullvm." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "metadata": { "id": "87jOeOOtktQy" }, "source": [ "### Fine-tune a fast.ai model\n" ] }, { "cell_type": "markdown", "metadata": { "id": "XlVUVGOAlS6O" }, "source": [ "For the tutorial, we will use a fast.ai notebook for beginners in which we will classify whether the input image contains a cat (True label) or a dog (False label). 
Let's jump to the code.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9cFt-FEvlNkG" }, "outputs": [], "source": [ "from fastai.vision.all import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GqdMEBPZlmpu", "outputId": "18d8a166-9b5d-4c91-cbc7-c8591bd5c0d2" }, "outputs": [], "source": [ "path = untar_data(URLs.PETS)\n", "files = get_image_files(path/\"images\")\n", "\n", "def label_func(f): return f[0].isupper()\n", "\n", "dls = ImageDataLoaders.from_name_func(path, files, label_func, item_tfms=Resize(224), num_workers=0)\n", "dls.show_batch()" ] }, { "cell_type": "markdown", "metadata": { "id": "VrmI4VeZlhJG" }, "source": [ "After downloading a sample of images of dogs and cats, we fine-tune the fast.ai model.\n", "\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MJ8q9xxBlv1x", "outputId": "8169f902-3dd0-449c-c293-91fb7ab94003" }, "outputs": [], "source": [ "learn = cnn_learner(dls, resnet34, metrics=error_rate)\n", "learn.fine_tune(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RBzr8_47lxsW", "outputId": "b87781d6-2826-4cc6-9fd3-57da5cdcbbd4" }, "outputs": [], "source": [ "valid_loss, error = learn.validate()" ] }, { "cell_type": "markdown", "metadata": { "id": "WSWq0il6l0eC" }, "source": [ "Now that we have fine-tuned the model, let's calculate the time required to run a prediction as an average over 100 tests.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "o_iMOqI_l6-Y" }, "outputs": [], "source": [ "import time" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JNZXAgIYl883" }, "outputs": [], "source": [ "%%capture\n", "times = []\n", "for _ in range(100):\n", " st = time.time()\n", " preds = 
learn.predict(files[0])\n", " times.append((time.time()-st)*1000)\n", "fastai_vanilla_time = sum(times)/len(times)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "N9IDkfyDmADn", "outputId": "0113620d-4c77-4a9f-ae1e-e64b0cb32293" }, "outputs": [], "source": [ "print(f\"Average prediction time: {fastai_vanilla_time} ms,\\nPrediction: {preds}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "hlwl87jRmBy2" }, "outputs": [], "source": [ "#learn.save(\".\")" ] }, { "cell_type": "markdown", "metadata": { "id": "bes-NoZnmhyy" }, "source": [ "### Install nebullvm" ] }, { "cell_type": "markdown", "id": "48aljCHu14-H", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install nebullvm:" ] }, { "cell_type": "code", "execution_count": null, "id": "QFQh3BVr1-GO", "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "id": "8a7a86b3", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "id": "cffbfa32", "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data preparation" ] }, { "cell_type": "markdown", "metadata": { "id": "zVfy0VBooG_J" }, "source": [ "Now we prepare the dataset so that it can be processed by Speedster." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RuUavpyooIBT" }, "outputs": [], "source": [ "import torch\n", "\n", "xs, ys = [], []\n", "for i, (x, y) in enumerate(dls.train):\n", " if i >=100:\n", " break\n", " xs.append(x)\n", " ys.append(y)\n", "xs = torch.cat(xs, dim=0)\n", "ys = torch.cat(ys, dim=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kkVzQVmgoMQh" }, "outputs": [], "source": [ "dl_nebullvm = [((x.unsqueeze(dim=0),), y.unsqueeze(0)) for x, y in zip(xs, ys)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_Eb_AAeqoOUS" }, "outputs": [], "source": [ "original_model = learn.model" ] }, { "cell_type": "markdown", "metadata": { "id": "0siBvWcsnv49" }, "source": [ "### Unconstrained without accuracy loss (thus constrained)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ToxCH47qstn9" }, "outputs": [], "source": [ "import torch\n", "import torchvision.models as models\n", "from speedster import optimize_model, save_model, load_model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "njoWqCSzvzpr" }, "outputs": [], "source": [ "# Load a resnet as example\n", "model = original_model\n", "\n", "# Provide an input data for the model \n", "input_data = dl_nebullvm\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\",\n", ")\n", "\n", "# Try the optimized model\n", "# x = torch.randn(1, 3, 224, 224)\n", "# res = optimized_model(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GGRbJL6Xq6Ns" }, "outputs": [], "source": [ "optimized_model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "h75V23FSs2MZ" }, "outputs": [], "source": [ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": 
[], "source": [ "# Set the model to eval mode and move it to the available device\n", "model.eval()\n", "model.to(device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "R_QrrT0oq1i_" }, "outputs": [], "source": [ "res_optimized = optimized_model(x)\n", "res_optimized" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xtjV8pDYxIIl" }, "outputs": [], "source": [ "from nebullvm.tools.benchmark import benchmark\n", "\n", "benchmark(model, input_data)\n", "benchmark(optimized_model, input_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "lWJCMGGJxaG5" }, "source": [ "### Unconstrained with 2% accuracy loss" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "g9Huil4-xeX5" }, "outputs": [], "source": [ "# Load a resnet as example\n", "model = original_model\n", "\n", "# Provide an input data for the model \n", "input_data = dl_nebullvm\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\", metric_drop_ths=0.02, metric=\"accuracy\"\n", ")\n", "\n", "# Try the optimized model\n", "# x = torch.randn(1, 3, 224, 224)\n", "# res = optimized_model(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cLxoOzxe4clI" }, "outputs": [], "source": [ "# Set the model to eval mode and move it to the available device\n", "model.eval()\n", "model.to(device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c3QvxwUD4clI" }, "outputs": [], "source": [ "optimized_model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dRLd4QMJ4clI" }, "outputs": [], "source": [ "benchmark(model, input_data)\n", "benchmark(optimized_model, input_data)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "ceb60d8c", "metadata": { "id": "ceb60d8c" }, "source": [ "## Save and reload the optimized model" ] }, { "attachments": {}, "cell_type": "markdown", 
"id": "d9eda1a0", "metadata": {}, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "id": "62b6fcbf", "metadata": {}, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3c968d51", "metadata": {}, "source": [ "We can then load again the model:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1340c49", "metadata": {}, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/pytorch/Readme.md ================================================ # **PyTorch Optimization** This section contains all the available notebooks that show how to leverage Speedster to optimize PyTorch models. ## Notebooks: | Notebook | Description | | |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Accelerate Torchvision Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from Torchvision. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) | | [Accelerate Fast AI Resnet34](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet34 model loaded from Fast AI. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb) | | [Accelerate PyTorch ViT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb) | Show how to optimize with Speedster a PyTorch ViT model. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb) | | [Accelerate Ultralytics YOLOv5](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb) | Show how to optimize with Speedster a YOLOv5 model from Ultralytics. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb) | | [Accelerate Ultralytics YOLOv8](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb) | Show how to optimize with Speedster a YOLOv8 model from Ultralytics. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb) | ## PyTorch API quick view: ``` python import torch import torchvision.models as models from speedster import optimize_model # Load a resnet as example model = models.resnet50() # Provide an input data for the model input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))] # Run Speedster optimization optimized_model = optimize_model( model, input_data=input_data, optimization_time="unconstrained" ) # Try the optimized model x = torch.randn(1, 3, 256, 256) ## Warmup the model ## This step is necessary before the latency computation of the ## optimized model in order to get reliable results. # for _ in range(10): # optimized_model(x) res = optimized_model(x) ``` ================================================ FILE: optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "p5b0PzpW1xJq" }, "source": [ "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" ] }, { "cell_type": "markdown", "metadata": { "id": "-KdJPm7M05Jc" }, "source": [ "# Accelerate Tensorflow ResNet50 with Speedster" ] }, { "cell_type": "markdown", "metadata": { "id": "T9xuwZEHzN2K" }, "source": [ "Hi and welcome 👋\n", "\n", "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\n", "\n", "We will\n", "1. Install Speedster and the deep learning compilers used by the library.\n", "2. Speed up a Tensorflow ResNet50 without any loss of accuracy.\n", "3. 
Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n", "\n", "Let's jump to the code." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KIeIvBPVLQuq" }, "outputs": [], "source": [ "%env CUDA_VISIBLE_DEVICES=0" ] }, { "cell_type": "markdown", "metadata": { "id": "HbFy2Aykz2Qo" }, "source": [ "### Installation" ] }, { "cell_type": "markdown", "metadata": { "id": "48aljCHu14-H" }, "source": [ "Install Speedster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QFQh3BVr1-GO" }, "outputs": [], "source": [ "!pip install speedster" ] }, { "cell_type": "markdown", "metadata": { "id": "8a7a86b3" }, "source": [ "Install deep learning compilers:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cffbfa32" }, "outputs": [], "source": [ "!python -m nebullvm.installers.auto_installer --frameworks tensorflow --compilers all" ] }, { "cell_type": "markdown", "metadata": { "id": "N5RXHoZl0p3p" }, "source": [ "## Optimization example with Tensorflow" ] }, { "cell_type": "markdown", "metadata": { "id": "-Ju-VcRH01Mw" }, "source": [ "In the following example we will try to optimize a standard resnet50 loaded directly from keras.\n", "\n", "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. 
Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n", "\n", "Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate the model under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)." ] }, { "cell_type": "markdown", "metadata": { "id": "skxEuemn171G" }, "source": [ "### Scenario 1 - No accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "wVRLXrDi2VaG" }, "source": [ "First we load the model and optimize it using the Speedster API:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2RbgGruAeQcf" }, "outputs": [], "source": [ "# If you encountered any error, run the cell again\n", "import tensorflow as tf\n", "from tensorflow.keras.applications.resnet50 import ResNet50\n", "from speedster import optimize_model, save_model, load_model\n", "\n", "# Load a resnet as example\n", "model = ResNet50()\n", "\n", "# Provide an input data for the model \n", "input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0]))]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\"\n", ")\n", "\n", "# Try the optimized model\n", "x = tf.random.normal([1, 224, 224, 3])\n", "res_original = model.predict(x)\n", "res_optimized = optimized_model.predict(x)[0]" ] }, { "cell_type": "markdown", "metadata": { "id": "NGrk6_jwRubP" }, "source": [ "We can print the type of the optimized model to see which compiler was faster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cVMn6erJLQuu" }, "outputs": [], "source": [ "optimized_model" ] }, { "cell_type": "markdown", "metadata": { "id": "aT0BhdIKR7gY" }, "source": [ "In our case, the optimized model type was TensorflowNvidiaInferenceLearner, so this means that Tensor RT was the faster compiler." 
] }, { "cell_type": "markdown", "metadata": { "id": "JMiuufyu2gD3" }, "source": [ "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement" ] }, { "cell_type": "markdown", "metadata": { "id": "Swpr-Wi5Si9a" }, "source": [ "First of all, let's print the results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MjGtKkeZSOc7" }, "outputs": [], "source": [ "res_original" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dhe94Tk3SSfn" }, "outputs": [], "source": [ "res_optimized" ] }, { "cell_type": "markdown", "metadata": { "id": "UqxzStjD2v0r" }, "source": [ "Then, let's compute the average latency of the baseline model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ELyTjg6_S4Us" }, "outputs": [], "source": [ "import time" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dkt67_Orwlv4" }, "outputs": [], "source": [ "num_iters = 100\n", "\n", "# Warmup\n", "for i in range(10):\n", " model.predict(x)\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " model.predict(x)\n", "stop = time.time()\n", "\n", "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))" ] }, { "cell_type": "markdown", "metadata": { "id": "AgOv-GqQ3KIC" }, "source": [ "Finally we compute the average latency for the optimized model:\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4PodpaDVfwzT" }, "outputs": [], "source": [ "# Warmup\n", "for i in range(10):\n", " optimized_model.predict(x)\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " optimized_model.predict(x)\n", "stop = time.time()\n", "\n", "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))" ] }, { "cell_type": "markdown", "metadata": { "id": "tBeRKNTI3iyK" }, "source": [ "### 
Scenario 2 - Accuracy drop" ] }, { "cell_type": "markdown", "metadata": { "id": "w3wutIzfAMe_" }, "source": [ "In this scenario, we set a max threshold for the accuracy drop to 2%" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fO1nGqpj3p7z" }, "outputs": [], "source": [ "import tensorflow as tf\n", "from tensorflow.keras.applications.resnet50 import ResNet50\n", "from speedster import optimize_model\n", "\n", "# Load a resnet as example\n", "model = ResNet50()\n", "\n", "# Provide an input data for the model \n", "# Note that in this case we should provide the model at least 100 data samples\n", "input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for i in range(100)]\n", "\n", "# Run Speedster optimization\n", "optimized_model = optimize_model(\n", " model, input_data=input_data, optimization_time=\"unconstrained\", metric = \"accuracy\", metric_drop_ths = 0.02\n", ")\n", "\n", "# Try the optimized model\n", "x = tf.random.normal([1, 224, 224, 3])\n", "res_original = model.predict(x)\n", "res_optimized = optimized_model.predict(x)[0]" ] }, { "cell_type": "markdown", "metadata": { "id": "yfW9kmHX-pGi" }, "source": [ "Here we compute the average latency for the baseline model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0MMrL3959hli" }, "outputs": [], "source": [ "num_iters = 100\n", "\n", "# Warmup\n", "for i in range(10):\n", " model.predict(x)\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " model.predict(x)\n", "stop = time.time()\n", "\n", "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))" ] }, { "cell_type": "markdown", "metadata": { "id": "i3GqasOM-u8f" }, "source": [ "Here we compute the average latency for the optimized model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_IbAW0KA4Fm5" }, "outputs": [], "source": [ "# Warmup\n", "for i in range(10):\n", " 
optimized_model.predict(x)\n", "\n", "start = time.time()\n", "for i in range(num_iters):\n", " optimized_model.predict(x)\n", "stop = time.time()\n", "\n", "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))" ] }, { "cell_type": "markdown", "metadata": { "id": "4XFMC1S6zXTU" }, "source": [ "## Save and reload the optimized model" ] }, { "cell_type": "markdown", "metadata": { "id": "OXHVr3EAzbT5" }, "source": [ "We can easily save to disk the optimized model with the following line:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3M565P-zzaFB" }, "outputs": [], "source": [ "save_model(optimized_model, \"model_save_path\")" ] }, { "cell_type": "markdown", "metadata": { "id": "ee8CS_Evzg1j" }, "source": [ "We can then load again the model:\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zOQ88SY_zg-A" }, "outputs": [], "source": [ "optimized_model = load_model(\"model_save_path\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b77ff2ac", "metadata": { "id": "b77ff2ac" }, "source": [ "
\n", " Join the community |\n", " Contribute to the library \n", "
\n", "\n", "
\n", " How speedster works •\n", " Documentation •\n", " Quick start \n", "
" ] } ], "metadata": { "accelerator": "GPU", "colab": { "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3.8.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: optimization/speedster/notebooks/tensorflow/Readme.md ================================================ # **Tensorflow Optimization** This section contains all the available notebooks that show how to leverage Speedster to optimize Tensorflow models. ## Notebooks: | Notebook | Description | | |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Accelerate Keras Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from keras. 
"""Setuptools packaging script for the ``speedster`` distribution."""
from pathlib import Path

from setuptools import find_packages, setup

# Runtime dependencies handed to ``install_requires`` below.
# NOTE(review): the requirements.txt next to this script asks for
# nebullvm>=0.10.0 while this list allows >=0.9.0 -- confirm which lower
# bound is the intended one and align the two files.
REQUIREMENTS = [
    "nebullvm>=0.9.0",
    "tabulate>=0.8.0",
]

# The README that sits beside this script is reused as the long description.
readme_path = Path(__file__).parent / "README.md"
long_description = readme_path.read_text(encoding="utf8")

setup(
    name="speedster",
    version="0.4.0",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
    long_description=long_description,
    include_package_data=True,
    long_description_content_type="text/markdown",
)
noqa: F401 from nebullvm.operations.inference_learners.utils import ( # noqa: F401 load_model, save_model, ) ================================================ FILE: optimization/speedster/speedster/api/__init__.py ================================================ ================================================ FILE: optimization/speedster/speedster/api/functions.py ================================================ import logging from typing import ( Union, Iterable, Sequence, Callable, Dict, List, Optional, ) from nebullvm.config import DEFAULT_METRIC_DROP_THS from nebullvm.optional_modules.tensorflow import tensorflow as tf from nebullvm.optional_modules.torch import torch from nebullvm.tools.logger import debug_mode_enabled, LoggingContext from speedster.root_op import SpeedsterRootOp from nebullvm.tools.utils import check_device def optimize_model( model: Union[torch.nn.Module, tf.Module, str], input_data: Union[Iterable, Sequence], metric_drop_ths: float = DEFAULT_METRIC_DROP_THS, metric: Union[str, Callable] = None, optimization_time: str = "constrained", dynamic_info: Dict = None, config_file: str = None, ignore_compilers: List[str] = None, ignore_compressors: List[str] = None, store_latencies: bool = False, device: Optional[str] = None, **kwargs, ): """Optimize the input model regardless of the framework it was used for implementing it. The optimized model given as output will share with the input one the same API, i.e. the optimized model will have the same interface as the original one. Args: model (Union[torch.Module, tf.Module, str]): The input model. It can be a torch or tensorflow model or a path to an onnx saved model. input_data (Iterable or Sequence): Input data to be used for optimizing the model. Note that if 'unconstrained' is selected as `optimization_time`, it would be beneficial to provide at least 100 data samples in order to use all the techniques supported by Nebullvm. 
The data can be given in either as sequence (data can be accessed by "element", e.g. `data[i]`) or iterable (data needs to be accessed with loop, e.g. `for x in data`). PyTorch, TensorFlow and Onnx respectively accept input tensor in `torch.Tensor`, `tf.Tensor` and `np.ndarray` formats. Note that each input sample must be a tuple containing a tuple as first element, the `inputs`, and the `label` as second element. The `inputs` needs to be passed as tuple even if a single input is needed by the model (in this case the `inputs` tuple will contain just an element). HuggingFace models can take as data samples both dictionaries or strings. Strings will then be converted in data samples using the HuggingFace tokenizer which must be given as input when just a list of string is provided as input_data (tokenizers can be passed as extra arguments of this function using the keyword `tokenizer`). metric_drop_ths (float, optional): Maximum reduction in the selected metric accepted. No model with a higher error will be accepted, i.e. all optimized model having a larger error respect to the original one will be discarded, without even considering their possible speed-up. Default: None, i.e. no drop in metric accepted. metric (Union[Callable, str], optional): The metric to be used for accepting or refusing a precision-reduction optimization proposal. If none is given but a `metric_drop_ths` is received, the `nebullvm.measure.compute_relative_difference` metric will be used as default one. A user-defined metric can be passed as function accepting as inputs two tuples of tensors (produced by the baseline and the optimized model) and the related original labels. For more information see `nebullvm.measure.compute_relative_difference` and `nebullvm.measure.compute_accuracy_drop`. `metric` accepts as value also a string containing the metric name. At the current stage the supported metrics are `"numeric_precision"` and `"accuracy"`. 
Default: `"numeric_precision"` optimization_time (OptimizationTime, optional): The optimization time mode. It can be either 'constrained' or 'unconstrained'. For 'constrained' mode just compilers and precision reduction techniques are used (no compression). 'Unconstrained' optimization allows the usage of more time-consuming techniques as pruning and distillation. Note that for using many of the sophisticated techniques in the 'unconstrained' optimization, a small fine-tuning of the model will be needed. Thus we highly recommend to give as input_data at least 100 samples for when selecting 'unconstrained' optimization. Default: 'constrained'. dynamic_info (Dict, optional): Dictionary containing info about the dynamic axis. It should contain as keys both "inputs" and "outputs" and as values two lists of dictionaries where each dictionary represents the dynamic axis information for an input/output tensor. The inner dictionary should have as key an integer, i.e. the dynamic axis (considering also the batch size) and as value a string giving a "tag" to it, e.g. "batch_size". Default: None config_file (str, optional): Configuration file containing the parameters needed for defining the CompressionStep in the pipeline. Default: None. ignore_compilers (List, optional): List containing the compilers to be ignored during the OptimizerStep. The compiler name should be one among tvm, tensor RT, openvino, onnxruntime, deepsparse, tflite, bladedisc, torchscript, intel_neural_compressor. Default: None. ignore_compressors (List, optional): List containing the compressors to be ignored during the CompressionStep. The compiler name should be one among . Default: None. store_latencies (bool, optional): Parameter that allows to save the latency for each compiler used by nebullvm. Default: False. device (str, optional): Device used, can be 'cpu' or 'gpu'. If not set, gpu will be used if available, otherwise cpu. 
def test_torch_huggingface_ort_input_text():
    """Optimize a PyTorch ALBERT model from raw sentences with onnxruntime only.

    The optimized learner is saved and reloaded, then its outputs on a new
    sentence are compared with the outputs of the original model.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1")

    # Use the GPU when torch can see one, otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sentences = [
        "this is a test",
        "hi my name is Valerio",
        "india is very far from italy",
    ]
    tokenizer_options = dict(
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors="pt",
        return_token_type_ids=None,  # Sets to model default
        padding="longest",
        truncation=True,
    )
    # Every compiler except onnxruntime is excluded from the run.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]

    optimized_model = optimize_model(
        model=model,
        input_data=sentences,
        optimization_time="constrained",
        tokenizer=tokenizer,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        tokenizer_args=tokenizer_options,
    )

    # Round trip through the filesystem: save, then load back.
    with TemporaryDirectory() as tmp_dir:
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, HuggingFaceInferenceLearner)
        assert isinstance(loaded_model.get_size(), int)

    x = ["this is a test input to see if the optimized model works."]
    inputs = tokenizer(x, return_tensors="pt").to(device)
    model.to(device)
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # Mean absolute difference must stay below the tolerance on both heads.
    for key in ("last_hidden_state", "pooler_output"):
        assert torch.mean(abs(res_original[key] - res_optimized[key])) < 1e-2
def test_torch_huggingface_torchscript_input_tensors():
    """Optimize a torchscript-enabled ALBERT model from tokenized tensors.

    Only the torchscript compiler is left enabled; the first two outputs of
    the optimized learner are compared with those of the original model.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1", torchscript=True)

    # Use the GPU when torch can see one, otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    text = "hi my name is Valerio"
    sample = tokenizer(text, return_tensors="pt").to(device)

    dynamic_info = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "torchscript"
    ]

    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        dynamic_info=dynamic_info,
    )

    x = ["this is a test input to see if the optimized model works."]
    inputs = tokenizer(x, return_tensors="pt").to(device)
    model.to(device)
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # With torchscript=True the outputs are indexed by position.
    for idx in (0, 1):
        assert torch.mean(abs(res_original[idx] - res_optimized[idx])) < 1e-2
def test_tensorflow_huggingface_ort_input_tensors_np():
    """Optimize a TensorFlow ALBERT model from numpy-tokenized inputs.

    Only onnxruntime is left enabled; outputs on a different sentence are
    compared with those of the original model.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    text = "hi my name is Valerio"
    sample = tokenizer(text, return_tensors="np")

    dynamic_info = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]

    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        dynamic_info=dynamic_info,
    )

    x = ["Test to see if it works with a different output"]
    inputs = tokenizer(x, return_tensors="np")
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # Largest absolute difference must stay below the tolerance.
    for key in ("last_hidden_state", "pooler_output"):
        gap = abs(res_original[key] - res_optimized[key])
        assert tf.math.reduce_max(gap) < 1e-2
def test_tensorflow_huggingface_ort_input_tensors_tf():
    """Optimize a TensorFlow ALBERT model from tf-tokenized inputs.

    Only onnxruntime is left enabled; outputs on a different sentence are
    compared with those of the original model.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    text = "hi my name is Valerio"
    sample = tokenizer(text, return_tensors="tf")

    dynamic_info = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]

    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        dynamic_info=dynamic_info,
    )

    x = ["Test to see if it works with a different output"]
    inputs = tokenizer(x, return_tensors="tf")
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # Largest absolute difference must stay below the tolerance.
    for key in ("last_hidden_state", "pooler_output"):
        gap = abs(res_original[key] - res_optimized[key])
        assert tf.math.reduce_max(gap) < 1e-2
def test_onnx_ort():
    """Optimize an ONNX export of ResNet18 with onnxruntime only.

    The optimized learner is saved and reloaded, and its output on a random
    input is compared with the output of the original PyTorch model.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        # The ONNX model is fed numpy arrays rather than torch tensors.
        input_data = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped_compilers = [
            name for name in COMPILER_LIST if name != "onnxruntime"
        ]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=input_data,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # A second temporary directory holds the saved learner.
        with TemporaryDirectory() as save_dir:
            optimized_model.save(save_dir)
            loaded_model = load_model(save_dir)
            assert isinstance(loaded_model, NumpyONNXInferenceLearner)
            assert isinstance(loaded_model.get_size(), int)

            # Try the optimized model
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )
            x = torch.randn(1, 3, 256, 256, requires_grad=False)
            model.to(device).eval()
            with torch.inference_mode():
                res_original = model(x.to(device))
                res_optimized = optimized_model(x.numpy())[0]

            reference = res_original.detach().cpu().numpy()
            assert abs(reference - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
def test_onnx_tensorrt():
    """Optimize an ONNX export of ResNet18 with the tensor_rt compiler only.

    The output of the optimized learner on a random input is compared with
    the output of the original PyTorch model.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        # The ONNX model is fed numpy arrays rather than torch tensors.
        input_data = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped_compilers = [
            name for name in COMPILER_LIST if name != "tensor_rt"
        ]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=input_data,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # Try the optimized model
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()
        with torch.inference_mode():
            res_original = model(x.to(device))
            res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyONNXTensorRTInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_onnx_tvm():
    """Optimize an ONNX export of ResNet18 with the tvm compiler only.

    The output of the optimized learner on a random input is compared with
    the output of the original PyTorch model.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        # The ONNX model is fed numpy arrays rather than torch tensors.
        input_data = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped_compilers = [name for name in COMPILER_LIST if name != "tvm"]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=input_data,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # Try the optimized model
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()
        with torch.inference_mode():
            res_original = model(x.to(device))
            res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyApacheTVMInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2
def test_torch_ort():
    """Optimize a PyTorch ResNet18 with onnxruntime only.

    The optimized learner is saved and reloaded, and its output on a random
    input is compared with the output of the original model.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Round trip through the filesystem: save, then load back.
    with TemporaryDirectory() as tmp_dir:
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, PytorchONNXInferenceLearner)
        assert isinstance(loaded_model.get_size(), int)

    # Try the optimized model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchONNXInferenceLearner)
    max_gap = torch.max(abs(res_original - res_optimized))
    assert max_gap < 1e-2
def test_torch_torchscript():
    """Optimize a PyTorch ResNet18 with the torchscript compiler only.

    The output of the optimized learner on a random input is compared with
    the output of the original model.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "torchscript"
    ]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, TorchScriptInferenceLearner)
    max_gap = torch.max(abs(res_original - res_optimized))
    assert max_gap < 1e-2
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
@pytest.mark.skipif(
    not check_module_version(torch, max_version="1.13.1+cu117"),
    reason="Skip because torch version is not supported.",
)
def test_torch_tensorrt():
    """Optimize a PyTorch ResNet18 with the tensor_rt compiler only.

    Either of the two TensorRT learner classes is accepted as a result; the
    output on a random CUDA input is compared with the original model.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "tensor_rt"
    ]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    x = torch.randn(1, 3, 256, 256).cuda()
    model.cuda().eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    accepted_learners = (
        PytorchTensorRTInferenceLearner,
        PytorchONNXTensorRTInferenceLearner,
    )
    assert isinstance(optimized_model, accepted_learners)
    max_gap = torch.max(abs(res_original - res_optimized))
    assert max_gap < 1e-2
@pytest.mark.skipif(
    not bladedisc_is_available(),
    reason="Can't test bladedisc if it's not installed.",
)
def test_torch_bladedisc():
    """Optimize a PyTorch ResNet18 with the bladedisc compiler only.

    The output of the optimized learner on a random input is compared with
    the output of the original model.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "bladedisc"
    ]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, BladeDISCInferenceLearner)
    max_gap = torch.max(abs(res_original - res_optimized))
    assert max_gap < 1e-2
# Limit tensorflow gpu memory usage
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Only the first GPU is made visible. The call does not depend on
        # the loop variable below, so it is issued once rather than once
        # per physical device.
        tf.config.set_visible_devices(gpus[0], "GPU")
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(
            len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs"
        )
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
@pytest.mark.skipif(
    gpu_is_available(),
    reason="TFLite does not support Nvidia GPUs",
)
def test_tensorflow_tflite():
    """Optimize a Keras ResNet50 with the tflite compiler only.

    A metric drop threshold of 0.1 is passed to the optimizer; the output on
    a random input is compared with the prediction of the original model.
    """
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped_compilers = [name for name in COMPILER_LIST if name != "tflite"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        metric_drop_ths=0.1,
    )

    # Try the optimized model
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TFLiteBackendInferenceLearner)
    max_gap = abs(res_original - res_optimized).max()
    assert max_gap < 1e-2
@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_tensorflow_openvino():
    """Optimize a Keras ResNet50 on CPU with the openvino compiler only.

    The output of the optimized learner on a random input is compared with
    the prediction of the original model.
    """
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "openvino"
    ]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        device="cpu",
    )

    # Try the optimized model
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TensorflowOpenVinoInferenceLearner)
    max_gap = abs(res_original - res_optimized).max()
    assert max_gap < 1e-2
def torch_to_onnx(model, input_data, output_path):
    """Convert a torch model to ONNX and return the path of the written file.

    Args:
        model: The torch model handed to `convert_torch_to_onnx`.
        input_data: Samples wrapped in a `DataManager` for the conversion.
        output_path: Directory in which "model.onnx" is created.

    Returns:
        The path `<output_path>/model.onnx` as built by `os.path.join`.
    """
    model_params = ModelParams(1, [], [], [])
    onnx_file = os.path.join(output_path, "model.onnx")

    # Convert on the GPU when one is available, otherwise on the CPU.
    if gpu_is_available():
        device_type = DeviceType.GPU
    else:
        device_type = DeviceType.CPU
    device = Device(device_type)

    data_manager = DataManager(input_data)
    convert_torch_to_onnx(
        model, data_manager, model_params, Path(onnx_file), device
    )
    return onnx_file
nebullvm.tools.data import DataManager from nebullvm.tools.feedback_collector import FeedbackCollector from tabulate import tabulate from nebullvm.tools.hardware_utils import get_hw_setup from nebullvm.tools.utils import ( get_model_size_mb, get_model_name, generate_model_id, ) SPEEDSTER_FEEDBACK_COLLECTOR = FeedbackCollector( url="https://nebuly.cloud/v1/store_speedster_results", disable_telemetry_environ_var="SPEEDSTER_DISABLE_TELEMETRY", app_version="0.4.0", ) def _convert_technique(technique: str): if technique.lower() == "none": # use fp32 instead of none technique = "fp32" elif technique == "HALF": technique = "fp16" elif technique == "STATIC": technique = "int8" else: technique = "int8_dynamic" return technique def _get_model_len(model: Any): try: return len(pickle.dumps(model, -1)) except Exception: logger.warning( "Cannot pickle input model. Unable to " "extract original model size" ) # Model is not pickable return -1 class SpeedsterRootOp(Operation): def __init__(self): super().__init__() self.optimize_inference_op = OptimizeInferenceOp() self.set_feedback_collector(SPEEDSTER_FEEDBACK_COLLECTOR) def _send_feedback( self, optimization_result: OptimizeInferenceResult, store_latencies: bool = False, ): model_orig = optimization_result.original_model.model model_name = get_model_name(model_orig) model_info = { "model_name": model_name, "model_size": f"{get_model_size_mb(model_orig)} MB", "framework": optimization_result.original_model.framework.value, } self.feedback_collector.store_info( key="model_id", value=generate_model_id(model_orig) ) self.feedback_collector.store_info( key="model_metadata", value=model_info ) self.feedback_collector.store_info( key="hardware_setup", value=get_hw_setup(self.device).__dict__ ) optimizations = self.feedback_collector.get("optimizations") original_model_dict = { "compiler": optimization_result.original_model.framework.value, "technique": "original", "latency": optimization_result.original_model.latency_seconds, } 
def execute(
    self,
    model: Any,
    input_data: Union[Iterable, Sequence, DataManager],
    metric_drop_ths: float = None,
    metric: Union[str, Callable] = None,
    optimization_time: str = "constrained",
    dynamic_info: Dict = None,
    config_file: str = None,
    ignore_compilers: List[str] = None,
    ignore_compressors: List[str] = None,
    store_latencies: bool = False,
    **kwargs,
):
    """Run the inference optimization and log a summary table.

    All arguments are forwarded unchanged to
    ``self.optimize_inference_op.execute`` after the operation has been
    moved to ``self.device``. ``store_latencies`` is additionally passed
    to ``self._send_feedback``.

    Returns:
        ``result.optimized_model.inference_learner``, or ``None`` when
        the optimization produced no optimized model (in which case no
        feedback is sent and nothing is printed).
    """
    self.logger.info(
        "Running Speedster on {}{}".format(
            self.device.type.name,
            f":{self.device.idx}"
            if self.device.type is not DeviceType.CPU
            else "",
        )
    )

    result = self.optimize_inference_op.to(self.device).execute(
        model=model,
        input_data=input_data,
        metric_drop_ths=metric_drop_ths,
        metric=metric,
        optimization_time=optimization_time,
        dynamic_info=dynamic_info,
        config_file=config_file,
        ignore_compilers=ignore_compilers,
        ignore_compressors=ignore_compressors,
        store_latencies=store_latencies,
        **kwargs,
    )

    if result.optimized_model is None:
        return None

    # Drops at or below MIN_NUMBER are displayed as a plain "0".
    opt_metric_drop = (
        f"{result.metric_drop:.4f}"
        if result.metric_drop > MIN_NUMBER
        else "0"
    )

    self._send_feedback(result, store_latencies=store_latencies)

    original = result.original_model
    optimized = result.optimized_model

    # Computed once; used in the table, the threshold check and the warning.
    speedup = original.latency_seconds / optimized.latency_seconds
    throughput_gain = optimized.throughput / original.throughput

    if original.size_mb > 0:
        size_delta_pct = int(
            (optimized.size_mb - original.size_mb) / original.size_mb * 100
        )
        # Size increases are clamped so the column never shows a
        # positive percentage.
        size_change = f"{min(size_delta_pct, 0)}%"
    else:
        size_change = "NA"

    table = [
        [
            "backend",
            original.framework.name,
            optimized.inference_learner.name,
            "",
        ],
        [
            "latency",
            f"{original.latency_seconds:.4f} sec/batch",
            f"{optimized.latency_seconds:.4f} sec/batch",
            f"{speedup:.2f}x",
        ],
        [
            "throughput",
            f"{original.throughput:.2f} data/sec",
            f"{optimized.throughput:.2f} data/sec",
            f"{throughput_gain:.2f}x",
        ],
        [
            "model size",
            f"{original.size_mb:.2f} MB",
            f"{optimized.size_mb:.2f} MB",
            size_change,
        ],
        ["metric drop", "", opt_metric_drop, ""],
        [
            "techniques",
            "",
            f"{_convert_technique(optimized.technique)}",
            "",
        ],
    ]
    headers = [
        "Metric",
        "Original Model",
        "Optimized Model",
        "Improvement",
    ]

    # change format to the logger, avoiding printing verbose info
    # to the console (as date, time, etc.)
    self.logger.remove()
    handler_id = self.logger.add(sys.stdout, format="{message}")
    try:
        hw_info = get_hw_setup(self.device)
        hw_name = (
            hw_info.cpu
            if self.device.type is DeviceType.CPU
            else hw_info.accelerator
        )
        self.logger.info(
            (
                f"\n[Speedster results on {hw_name}]\n"
                f"{tabulate(table, headers, tablefmt='heavy_outline')}"
            )
        )

        if speedup < 2:
            self.logger.warning(
                f"\nMax speed-up with your input parameters is "
                f"{speedup:.2f}x. "
                f"If you want to get a faster optimized model, "
                f"see the following link for some suggestions: "
                f"https://docs.nebuly.com/Speedster/advanced_"
                f"options/#acceleration-suggestions\n"
            )
    finally:
        # Always drop the temporary sink and restore the standard logger
        # setup, even if building or printing the report failed.
        self.logger.remove(handler_id)
        setup_logger()

    return optimized.inference_learner
throughput=1 ), hardware_setup=mocker.MagicMock(), ), ) mocker.patch.object(root_op, "_send_feedback") res = root_op.execute( model=None, input_data=mocker.MagicMock(), metric_drop_ths=None, metric="latency", optimization_time=mocker.MagicMock(), dynamic_info=None, config_file=None, ignore_compilers=None, ignore_compressors=None, store_latencies=False, ) assert res is not None ================================================ FILE: optimization/speedster/speedster/utils.py ================================================ ================================================ FILE: optimization/speedster/speedster.toml ================================================ [build-system] requires = [ "setuptools>=42", "wheel" ] build-backend = "setuptools.build_meta"