[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\noptimization/nebullvm/docs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n.idea\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# MacOS DS_Store\n.DS_Store\n\n# Pickle folder\n.pkl_memoize_py3\n\n# Folder where optimized models are stored\noptimized_model\n\n# Config file for tests coverage\n.coveragerc\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.2.0\nmessage: \"If you use this software, please cite it as below.\"\nauthors:\n- family-names: \"Nebuly\"\n  given-names: \"S.r.l\"\n- family-names: \"Fiori\"\n  given-names: \"Diego\"\n  orcid: \"https://orcid.org/0000-0003-1910-0565\"\n- family-names: \"Sofi\"\n  given-names: \"Valerio\"\n  orcid: \"https://orcid.org/0000-0001-5978-897X\"\ntitle: \"nebullvm\"\nversion: 0.4.3\ndate-released: 2022-10-10\nurl: \"https://github.com/nebuly-ai/nebullvm\"\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participation in our\ncommunity a harassment-free experience for everyone, regardless of age, body\nsize, visible or invisible disability, ethnicity, sex characteristics, gender\nidentity and expression, level of experience, education, socio-economic status,\nnationality, personal appearance, race, religion, or sexual identity\nand orientation.\n\nWe pledge to act and interact in ways that contribute to an open, welcoming,\ndiverse, inclusive, and healthy community.\n\n## Our Standards\n\nExamples of behavior that contributes to a positive environment for our\ncommunity include:\n\n* Demonstrating empathy and kindness toward other people\n* Being respectful of differing opinions, viewpoints, and experiences\n* Giving and gracefully accepting constructive feedback\n* Accepting responsibility and apologizing to those affected by our mistakes,\n  and learning from the experience\n* Focusing on what is best not just for us as individuals, but for the\n  overall community\n\nExamples of unacceptable behavior include:\n\n* The use of sexualized language or imagery, and sexual attention or\n  advances of any kind\n* Trolling, insulting or derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or email\n  address, without their explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Enforcement Responsibilities\n\nCommunity leaders are responsible for clarifying and enforcing our standards of\nacceptable behavior and will take appropriate and fair corrective action in\nresponse to any behavior that they deem inappropriate, threatening, offensive,\nor harmful.\n\nCommunity leaders have the right and responsibility to remove, edit, or reject\ncomments, commits, code, wiki edits, issues, 
and other contributions that are\nnot aligned to this Code of Conduct, and will communicate reasons for moderation\ndecisions when appropriate.\n\n## Scope\n\nThis Code of Conduct applies within all community spaces, and also applies when\nan individual is officially representing the community in public spaces.\nExamples of representing our community include using an official e-mail address,\nposting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported to the community leaders responsible for enforcement at\nsocial@nebuly.ai.\nAll complaints will be reviewed and investigated promptly and fairly.\n\nAll community leaders are obligated to respect the privacy and security of the\nreporter of any incident.\n\n## Enforcement Guidelines\n\nCommunity leaders will follow these Community Impact Guidelines in determining\nthe consequences for any action they deem in violation of this Code of Conduct:\n\n### 1. Correction\n\n**Community Impact**: Use of inappropriate language or other behavior deemed\nunprofessional or unwelcome in the community.\n\n**Consequence**: A private, written warning from community leaders, providing\nclarity around the nature of the violation and an explanation of why the\nbehavior was inappropriate. A public apology may be requested.\n\n### 2. Warning\n\n**Community Impact**: A violation through a single incident or series\nof actions.\n\n**Consequence**: A warning with consequences for continued behavior. No\ninteraction with the people involved, including unsolicited interaction with\nthose enforcing the Code of Conduct, for a specified period of time. This\nincludes avoiding interactions in community spaces as well as external channels\nlike social media. Violating these terms may lead to a temporary or\npermanent ban.\n\n### 3. 
Temporary Ban\n\n**Community Impact**: A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence**: A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact**: Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior,  harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence**: A permanent ban from any sort of public interaction within\nthe community.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage],\nversion 2.0, available at\nhttps://www.contributor-covenant.org/version/2/0/code_of_conduct.html.\n\nCommunity Impact Guidelines were inspired by [Mozilla's code of conduct\nenforcement ladder](https://github.com/mozilla/diversity).\n\n[homepage]: https://www.contributor-covenant.org\n\nFor answers to common questions about this code of conduct, see the FAQ at\nhttps://www.contributor-covenant.org/faq. Translations are available at\nhttps://www.contributor-covenant.org/translations.\n"
  },
  {
    "path": "README.md",
    "content": "# OptiMate\n\n**[Legacy]**\n\nThis repository is now in a legacy phase and is no longer actively maintained. Although the source code is still available in the Git history, there will be no additional updates or official support.\n\n**[About Nebuly]**\n\nOur team is fully committed on creating the best user-experience platform for LLMs so that companies can understand user behavior at scale when interacting with their LLM-based products. \n- To learn more on how to get started, visit our [official documentation](https://docs.nebuly.com/welcome/overview)\n- If you need enterprise support, please contact us [here](https://www.nebuly.com/nebuly-book-a-demo)\n\n**[About optimate]**\n\nWe have open-sourced a couple of internal projects to the community, but we are not currently maintaining them. Optimate is a collection of libraries designed to help you optimize your AI models. It is an open-source project developed by Nebuly AI but is **not actively maintained**.\n\nThe tools available to assist you in your optimization are:\n\n✅ [Speedster](https://github.com/nebuly-ai/optimate/tree/main/optimization/speedster): reduce inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs)\n\n✅ [Nos](https://github.com/nebuly-ai/nos): reduce infrastructure costs by leveraging real-time dynamic partitioning and elastic quotas to maximize the utilization of your Kubernetes GPU cluster\n\n✅ [ChatLLaMA](https://github.com/nebuly-ai/optimate/tree/main/optimization/chatllama): reduce hardware and data costs by leveraging fine-tuning optimization techniques and RLHF alignment\n"
  },
  {
    "path": "monitoring/nebuly/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/.github/workflows/tests.yml",
    "content": "name: Run tests\n\non:\n  push:\n    branches:\n      - \"main\"\n    paths-ignore:\n      - \".github/**\"\n      - \"*.md\"\n      - \"docs/**\"\n      - \"notebooks/**\"\n  pull_request:\n    branches:\n      - \"main\"\n    paths-ignore:\n      - \".github/**\"\n      - \"*.md\"\n      - \"docs/**\"\n      - \"notebooks/**\"\n\njobs:\n  test_on_ubuntu_cpu:\n    runs-on: ubuntu-20.04\n\n    strategy:\n      matrix:\n        # Run in all these versions of Python\n        python-version: [ 3.8, 3.9, \"3.10\" ]\n\n    steps:\n      # Checkout the latest code from the repo\n      - name: Checkout repo\n        uses: actions/checkout@v2\n        # Setup which version of Python to use\n      - name: Set Up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n        # Display the Python version being used\n      - name: Display Python version\n        run: python -c \"import sys; print(sys.version)\"\n        # Install nebullvm\n      - name: Install nebullvm\n        run: |\n          python -m pip install --upgrade pip\n          pip install .\n        # Install Speedster\n      - name: Install Speedster\n        run: |\n          cd apps/accelerate/speedster\n          pip install .\n          cd ../../..\n        # Install PyTorch\n      - name: Install PyTorch\n        run: python -m pip install torch==2.0.0\n        # Install compilers except tvm\n      - name: Install deep learning compilers\n        run: python -m nebullvm.installers.auto_installer --compilers all\n        # Install requirements for testing\n      - name: Install requirements for testing\n        run: pip install -r \"requirements-dev.txt\"\n        # Run api tests\n      - name: Run api tests\n        run: |\n          export SPEEDSTER_DISABLE_TELEMETRY=1\n          cd apps/accelerate/speedster\n          pytest\n          cd ../../..\n        # Run components tests\n      - name: Run 
components tests\n        run: |\n          cd nebullvm\n          pytest\n          cd ../\n\n#  test_on_windows_cpu:\n#    runs-on: windows-latest\n#\n#    strategy:\n#      matrix:\n#        # Run in all these versions of Python\n#        python-version: [ 3.8, 3.9, \"3.10\" ]\n#\n#    steps:\n#      # Checkout the latest code from the repo\n#      - name: Checkout repo\n#        uses: actions/checkout@v2\n#        # Setup which version of Python to use\n#      - name: Set Up Python ${{ matrix.python-version }}\n#        uses: actions/setup-python@v2\n#        with:\n#          python-version: ${{ matrix.python-version }}\n#        # Display the Python version being used\n#      - name: Display Python version\n#        run: python -c \"import sys; print(sys.version)\"\n#        # Install nebullvm\n#      - name: Install nebullvm\n#        run: |\n#          python -m pip install --upgrade pip\n#          pip install .\n#        # Install Speedster\n#      - name: Install Speedster\n#        run: |\n#          cd apps/accelerate/speedster\n#          pip install .\n#          cd ../../..\n#      - name: Install PyTorch\n#        run: python -m pip install torch==2.0.0\n#        # Install compilers except tvm\n#      - name: Install deep learning compilers\n#        run: python -m nebullvm.installers.auto_installer --compilers all\n#        # Install requirements for testing\n#      - name: Install requirements for testing\n#        run: pip install -r \"requirements-dev.txt\"\n#        # Run api tests\n#      - name: Run api tests\n#        run: |\n#          $env:SPEEDSTER_DISABLE_TELEMETRY=1\n#          cd apps/accelerate/speedster\n#          pytest\n#          cd ../../..\n#        # Run components tests\n#      - name: Run components tests\n#        run: |\n#          cd nebullvm\n#          pytest\n#          cd ../\n#\n"
  },
  {
    "path": "optimization/chatllama/LICENSE",
    "content": "                    GNU GENERAL PUBLIC LICENSE\n                       Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n                            Preamble\n\n  The GNU General Public License is a free, copyleft license for\nsoftware and other kinds of works.\n\n  The licenses for most software and other practical works are designed\nto take away your freedom to share and change the works.  By contrast,\nthe GNU General Public License is intended to guarantee your freedom to\nshare and change all versions of a program--to make sure it remains free\nsoftware for all its users.  We, the Free Software Foundation, use the\nGNU General Public License for most of our software; it applies also to\nany other work released this way by its authors.  You can apply it to\nyour programs, too.\n\n  When we speak of free software, we are referring to freedom, not\nprice.  Our General Public Licenses are designed to make sure that you\nhave the freedom to distribute copies of free software (and charge for\nthem if you wish), that you receive source code or can get it if you\nwant it, that you can change the software or use pieces of it in new\nfree programs, and that you know you can do these things.\n\n  To protect your rights, we need to prevent others from denying you\nthese rights or asking you to surrender the rights.  Therefore, you have\ncertain responsibilities if you distribute copies of the software, or if\nyou modify it: responsibilities to respect the freedom of others.\n\n  For example, if you distribute copies of such a program, whether\ngratis or for a fee, you must pass on to the recipients the same\nfreedoms that you received.  You must make sure that they, too, receive\nor can get the source code.  
And you must show them these terms so they\nknow their rights.\n\n  Developers that use the GNU GPL protect your rights with two steps:\n(1) assert copyright on the software, and (2) offer you this License\ngiving you legal permission to copy, distribute and/or modify it.\n\n  For the developers' and authors' protection, the GPL clearly explains\nthat there is no warranty for this free software.  For both users' and\nauthors' sake, the GPL requires that modified versions be marked as\nchanged, so that their problems will not be attributed erroneously to\nauthors of previous versions.\n\n  Some devices are designed to deny users access to install or run\nmodified versions of the software inside them, although the manufacturer\ncan do so.  This is fundamentally incompatible with the aim of\nprotecting users' freedom to change the software.  The systematic\npattern of such abuse occurs in the area of products for individuals to\nuse, which is precisely where it is most unacceptable.  Therefore, we\nhave designed this version of the GPL to prohibit the practice for those\nproducts.  If such problems arise substantially in other domains, we\nstand ready to extend this provision to those domains in future versions\nof the GPL, as needed to protect the freedom of users.\n\n  Finally, every program is threatened constantly by software patents.\nStates should not allow patents to restrict development and use of\nsoftware on general-purpose computers, but in those that do, we wish to\navoid the special danger that patents applied to a free program could\nmake it effectively proprietary.  To prevent this, the GPL assures that\npatents cannot be used to render the program non-free.\n\n  The precise terms and conditions for copying, distribution and\nmodification follow.\n\n                       TERMS AND CONDITIONS\n\n  0. 
Definitions.\n\n  \"This License\" refers to version 3 of the GNU General Public License.\n\n  \"Copyright\" also means copyright-like laws that apply to other kinds of\nworks, such as semiconductor masks.\n\n  \"The Program\" refers to any copyrightable work licensed under this\nLicense.  Each licensee is addressed as \"you\".  \"Licensees\" and\n\"recipients\" may be individuals or organizations.\n\n  To \"modify\" a work means to copy from or adapt all or part of the work\nin a fashion requiring copyright permission, other than the making of an\nexact copy.  The resulting work is called a \"modified version\" of the\nearlier work or a work \"based on\" the earlier work.\n\n  A \"covered work\" means either the unmodified Program or a work based\non the Program.\n\n  To \"propagate\" a work means to do anything with it that, without\npermission, would make you directly or secondarily liable for\ninfringement under applicable copyright law, except executing it on a\ncomputer or modifying a private copy.  Propagation includes copying,\ndistribution (with or without modification), making available to the\npublic, and in some countries other activities as well.\n\n  To \"convey\" a work means any kind of propagation that enables other\nparties to make or receive copies.  Mere interaction with a user through\na computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays \"Appropriate Legal Notices\"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\ntells the user that there is no warranty for the work (except to the\nextent that warranties are provided), that licensees may convey the\nwork under this License, and how to view a copy of this License.  If\nthe interface presents a list of user commands or options, such as a\nmenu, a prominent item in the list meets this criterion.\n\n  1. 
Source Code.\n\n  The \"source code\" for a work means the preferred form of the work\nfor making modifications to it.  \"Object code\" means any non-source\nform of a work.\n\n  A \"Standard Interface\" means an interface that either is an official\nstandard defined by a recognized standards body, or, in the case of\ninterfaces specified for a particular programming language, one that\nis widely used among developers working in that language.\n\n  The \"System Libraries\" of an executable work include anything, other\nthan the work as a whole, that (a) is included in the normal form of\npackaging a Major Component, but which is not part of that Major\nComponent, and (b) serves only to enable use of the work with that\nMajor Component, or to implement a Standard Interface for which an\nimplementation is available to the public in source code form.  A\n\"Major Component\", in this context, means a major essential component\n(kernel, window system, and so on) of the specific operating system\n(if any) on which the executable work runs, or a compiler used to\nproduce the work, or an object code interpreter used to run it.\n\n  The \"Corresponding Source\" for a work in object code form means all\nthe source code needed to generate, install, and (for an executable\nwork) run the object code and to modify the work, including scripts to\ncontrol those activities.  However, it does not include the work's\nSystem Libraries, or general-purpose tools or generally available free\nprograms which are used unmodified in performing those activities but\nwhich are not part of the work.  
For example, Corresponding Source\nincludes interface definition files associated with source files for\nthe work, and the source code for shared libraries and dynamically\nlinked subprograms that the work is specifically designed to require,\nsuch as by intimate data communication or control flow between those\nsubprograms and other parts of the work.\n\n  The Corresponding Source need not include anything that users\ncan regenerate automatically from other parts of the Corresponding\nSource.\n\n  The Corresponding Source for a work in source code form is that\nsame work.\n\n  2. Basic Permissions.\n\n  All rights granted under this License are granted for the term of\ncopyright on the Program, and are irrevocable provided the stated\nconditions are met.  This License explicitly affirms your unlimited\npermission to run the unmodified Program.  The output from running a\ncovered work is covered by this License only if the output, given its\ncontent, constitutes a covered work.  This License acknowledges your\nrights of fair use or other equivalent, as provided by copyright law.\n\n  You may make, run and propagate covered works that you do not\nconvey, without conditions so long as your license otherwise remains\nin force.  You may convey covered works to others for the sole purpose\nof having them make modifications exclusively for you, or provide you\nwith facilities for running those works, provided that you comply with\nthe terms of this License in conveying all material for which you do\nnot control copyright.  Those thus making or running the covered works\nfor you must do so exclusively on your behalf, under your direction\nand control, on terms that prohibit them from making any copies of\nyour copyrighted material outside their relationship with you.\n\n  Conveying under any other circumstances is permitted solely under\nthe conditions stated below.  Sublicensing is not allowed; section 10\nmakes it unnecessary.\n\n  3. 
Protecting Users' Legal Rights From Anti-Circumvention Law.\n\n  No covered work shall be deemed part of an effective technological\nmeasure under any applicable law fulfilling obligations under article\n11 of the WIPO copyright treaty adopted on 20 December 1996, or\nsimilar laws prohibiting or restricting circumvention of such\nmeasures.\n\n  When you convey a covered work, you waive any legal power to forbid\ncircumvention of technological measures to the extent such circumvention\nis effected by exercising rights under this License with respect to\nthe covered work, and you disclaim any intention to limit operation or\nmodification of the work as a means of enforcing, against the work's\nusers, your or third parties' legal rights to forbid circumvention of\ntechnological measures.\n\n  4. Conveying Verbatim Copies.\n\n  You may convey verbatim copies of the Program's source code as you\nreceive it, in any medium, provided that you conspicuously and\nappropriately publish on each copy an appropriate copyright notice;\nkeep intact all notices stating that this License and any\nnon-permissive terms added in accord with section 7 apply to the code;\nkeep intact all notices of the absence of any warranty; and give all\nrecipients a copy of this License along with the Program.\n\n  You may charge any price or no price for each copy that you convey,\nand you may offer support or warranty protection for a fee.\n\n  5. Conveying Modified Source Versions.\n\n  You may convey a work based on the Program, or the modifications to\nproduce it from the Program, in the form of source code under the\nterms of section 4, provided that you also meet all of these conditions:\n\n    a) The work must carry prominent notices stating that you modified\n    it, and giving a relevant date.\n\n    b) The work must carry prominent notices stating that it is\n    released under this License and any conditions added under section\n    7.  
This requirement modifies the requirement in section 4 to\n    \"keep intact all notices\".\n\n    c) You must license the entire work, as a whole, under this\n    License to anyone who comes into possession of a copy.  This\n    License will therefore apply, along with any applicable section 7\n    additional terms, to the whole of the work, and all its parts,\n    regardless of how they are packaged.  This License gives no\n    permission to license the work in any other way, but it does not\n    invalidate such permission if you have separately received it.\n\n    d) If the work has interactive user interfaces, each must display\n    Appropriate Legal Notices; however, if the Program has interactive\n    interfaces that do not display Appropriate Legal Notices, your\n    work need not make them do so.\n\n  A compilation of a covered work with other separate and independent\nworks, which are not by their nature extensions of the covered work,\nand which are not combined with it such as to form a larger program,\nin or on a volume of a storage or distribution medium, is called an\n\"aggregate\" if the compilation and its resulting copyright are not\nused to limit the access or legal rights of the compilation's users\nbeyond what the individual works permit.  Inclusion of a covered work\nin an aggregate does not cause this License to apply to the other\nparts of the aggregate.\n\n  6. 
Conveying Non-Source Forms.\n\n  You may convey a covered work in object code form under the terms\nof sections 4 and 5, provided that you also convey the\nmachine-readable Corresponding Source under the terms of this License,\nin one of these ways:\n\n    a) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by the\n    Corresponding Source fixed on a durable physical medium\n    customarily used for software interchange.\n\n    b) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by a\n    written offer, valid for at least three years and valid for as\n    long as you offer spare parts or customer support for that product\n    model, to give anyone who possesses the object code either (1) a\n    copy of the Corresponding Source for all the software in the\n    product that is covered by this License, on a durable physical\n    medium customarily used for software interchange, for a price no\n    more than your reasonable cost of physically performing this\n    conveying of source, or (2) access to copy the\n    Corresponding Source from a network server at no charge.\n\n    c) Convey individual copies of the object code with a copy of the\n    written offer to provide the Corresponding Source.  This\n    alternative is allowed only occasionally and noncommercially, and\n    only if you received the object code with such an offer, in accord\n    with subsection 6b.\n\n    d) Convey the object code by offering access from a designated\n    place (gratis or for a charge), and offer equivalent access to the\n    Corresponding Source in the same way through the same place at no\n    further charge.  You need not require recipients to copy the\n    Corresponding Source along with the object code.  
If the place to\n    copy the object code is a network server, the Corresponding Source\n    may be on a different server (operated by you or a third party)\n    that supports equivalent copying facilities, provided you maintain\n    clear directions next to the object code saying where to find the\n    Corresponding Source.  Regardless of what server hosts the\n    Corresponding Source, you remain obligated to ensure that it is\n    available for as long as needed to satisfy these requirements.\n\n    e) Convey the object code using peer-to-peer transmission, provided\n    you inform other peers where the object code and Corresponding\n    Source of the work are being offered to the general public at no\n    charge under subsection 6d.\n\n  A separable portion of the object code, whose source code is excluded\nfrom the Corresponding Source as a System Library, need not be\nincluded in conveying the object code work.\n\n  A \"User Product\" is either (1) a \"consumer product\", which means any\ntangible personal property which is normally used for personal, family,\nor household purposes, or (2) anything designed or sold for incorporation\ninto a dwelling.  In determining whether a product is a consumer product,\ndoubtful cases shall be resolved in favor of coverage.  For a particular\nproduct received by a particular user, \"normally used\" refers to a\ntypical or common use of that class of product, regardless of the status\nof the particular user or of the way in which the particular user\nactually uses, or expects or is expected to use, the product.  
A product\nis a consumer product regardless of whether the product has substantial\ncommercial, industrial or non-consumer uses, unless such uses represent\nthe only significant mode of use of the product.\n\n  \"Installation Information\" for a User Product means any methods,\nprocedures, authorization keys, or other information required to install\nand execute modified versions of a covered work in that User Product from\na modified version of its Corresponding Source.  The information must\nsuffice to ensure that the continued functioning of the modified object\ncode is in no case prevented or interfered with solely because\nmodification has been made.\n\n  If you convey an object code work under this section in, or with, or\nspecifically for use in, a User Product, and the conveying occurs as\npart of a transaction in which the right of possession and use of the\nUser Product is transferred to the recipient in perpetuity or for a\nfixed term (regardless of how the transaction is characterized), the\nCorresponding Source conveyed under this section must be accompanied\nby the Installation Information.  But this requirement does not apply\nif neither you nor any third party retains the ability to install\nmodified object code on the User Product (for example, the work has\nbeen installed in ROM).\n\n  The requirement to provide Installation Information does not include a\nrequirement to continue to provide support service, warranty, or updates\nfor a work that has been modified or installed by the recipient, or for\nthe User Product in which it has been modified or installed.  
Access to a\nnetwork may be denied when the modification itself materially and\nadversely affects the operation of the network or violates the rules and\nprotocols for communication across the network.\n\n  Corresponding Source conveyed, and Installation Information provided,\nin accord with this section must be in a format that is publicly\ndocumented (and with an implementation available to the public in\nsource code form), and must require no special password or key for\nunpacking, reading or copying.\n\n  7. Additional Terms.\n\n  \"Additional permissions\" are terms that supplement the terms of this\nLicense by making exceptions from one or more of its conditions.\nAdditional permissions that are applicable to the entire Program shall\nbe treated as though they were included in this License, to the extent\nthat they are valid under applicable law.  If additional permissions\napply only to part of the Program, that part may be used separately\nunder those permissions, but the entire Program remains governed by\nthis License without regard to the additional permissions.\n\n  When you convey a copy of a covered work, you may at your option\nremove any additional permissions from that copy, or from any part of\nit.  (Additional permissions may be written to require their own\nremoval in certain cases when you modify the work.)  
You may place\nadditional permissions on material, added by you to a covered work,\nfor which you have or can give appropriate copyright permission.\n\n  Notwithstanding any other provision of this License, for material you\nadd to a covered work, you may (if authorized by the copyright holders of\nthat material) supplement the terms of this License with terms:\n\n    a) Disclaiming warranty or limiting liability differently from the\n    terms of sections 15 and 16 of this License; or\n\n    b) Requiring preservation of specified reasonable legal notices or\n    author attributions in that material or in the Appropriate Legal\n    Notices displayed by works containing it; or\n\n    c) Prohibiting misrepresentation of the origin of that material, or\n    requiring that modified versions of such material be marked in\n    reasonable ways as different from the original version; or\n\n    d) Limiting the use for publicity purposes of names of licensors or\n    authors of the material; or\n\n    e) Declining to grant rights under trademark law for use of some\n    trade names, trademarks, or service marks; or\n\n    f) Requiring indemnification of licensors and authors of that\n    material by anyone who conveys the material (or modified versions of\n    it) with contractual assumptions of liability to the recipient, for\n    any liability that these contractual assumptions directly impose on\n    those licensors and authors.\n\n  All other non-permissive additional terms are considered \"further\nrestrictions\" within the meaning of section 10.  If the Program as you\nreceived it, or any part of it, contains a notice stating that it is\ngoverned by this License along with a term that is a further\nrestriction, you may remove that term.  
If a license document contains\na further restriction but permits relicensing or conveying under this\nLicense, you may add to a covered work material governed by the terms\nof that license document, provided that the further restriction does\nnot survive such relicensing or conveying.\n\n  If you add terms to a covered work in accord with this section, you\nmust place, in the relevant source files, a statement of the\nadditional terms that apply to those files, or a notice indicating\nwhere to find the applicable terms.\n\n  Additional terms, permissive or non-permissive, may be stated in the\nform of a separately written license, or stated as exceptions;\nthe above requirements apply either way.\n\n  8. Termination.\n\n  You may not propagate or modify a covered work except as expressly\nprovided under this License.  Any attempt otherwise to propagate or\nmodify it is void, and will automatically terminate your rights under\nthis License (including any patent licenses granted under the third\nparagraph of section 11).\n\n  However, if you cease all violation of this License, then your\nlicense from a particular copyright holder is reinstated (a)\nprovisionally, unless and until the copyright holder explicitly and\nfinally terminates your license, and (b) permanently, if the copyright\nholder fails to notify you of the violation by some reasonable means\nprior to 60 days after the cessation.\n\n  Moreover, your license from a particular copyright holder is\nreinstated permanently if the copyright holder notifies you of the\nviolation by some reasonable means, this is the first time you have\nreceived notice of violation of this License (for any work) from that\ncopyright holder, and you cure the violation prior to 30 days after\nyour receipt of the notice.\n\n  Termination of your rights under this section does not terminate the\nlicenses of parties who have received copies or rights from you under\nthis License.  
If your rights have been terminated and not permanently\nreinstated, you do not qualify to receive new licenses for the same\nmaterial under section 10.\n\n  9. Acceptance Not Required for Having Copies.\n\n  You are not required to accept this License in order to receive or\nrun a copy of the Program.  Ancillary propagation of a covered work\noccurring solely as a consequence of using peer-to-peer transmission\nto receive a copy likewise does not require acceptance.  However,\nnothing other than this License grants you permission to propagate or\nmodify any covered work.  These actions infringe copyright if you do\nnot accept this License.  Therefore, by modifying or propagating a\ncovered work, you indicate your acceptance of this License to do so.\n\n  10. Automatic Licensing of Downstream Recipients.\n\n  Each time you convey a covered work, the recipient automatically\nreceives a license from the original licensors, to run, modify and\npropagate that work, subject to this License.  You are not responsible\nfor enforcing compliance by third parties with this License.\n\n  An \"entity transaction\" is a transaction transferring control of an\norganization, or substantially all assets of one, or subdividing an\norganization, or merging organizations.  If propagation of a covered\nwork results from an entity transaction, each party to that\ntransaction who receives a copy of the work also receives whatever\nlicenses to the work the party's predecessor in interest had or could\ngive under the previous paragraph, plus a right to possession of the\nCorresponding Source of the work from the predecessor in interest, if\nthe predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  
For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\nrights granted under this License, and you may not initiate litigation\n(including a cross-claim or counterclaim in a lawsuit) alleging that\nany patent claim is infringed by making, using, selling, offering for\nsale, or importing the Program or any portion of it.\n\n  11. Patents.\n\n  A \"contributor\" is a copyright holder who authorizes use under this\nLicense of the Program or a work on which the Program is based.  The\nwork thus licensed is called the contributor's \"contributor version\".\n\n  A contributor's \"essential patent claims\" are all patent claims\nowned or controlled by the contributor, whether already acquired or\nhereafter acquired, that would be infringed by some manner, permitted\nby this License, of making, using, or selling its contributor version,\nbut do not include claims that would be infringed only as a\nconsequence of further modification of the contributor version.  For\npurposes of this definition, \"control\" includes the right to grant\npatent sublicenses in a manner consistent with the requirements of\nthis License.\n\n  Each contributor grants you a non-exclusive, worldwide, royalty-free\npatent license under the contributor's essential patent claims, to\nmake, use, sell, offer for sale, import and otherwise run, modify and\npropagate the contents of its contributor version.\n\n  In the following three paragraphs, a \"patent license\" is any express\nagreement or commitment, however denominated, not to enforce a patent\n(such as an express permission to practice a patent or covenant not to\nsue for patent infringement).  
To \"grant\" such a patent license to a\nparty means to make such an agreement or commitment not to enforce a\npatent against the party.\n\n  If you convey a covered work, knowingly relying on a patent license,\nand the Corresponding Source of the work is not available for anyone\nto copy, free of charge and under the terms of this License, through a\npublicly available network server or other readily accessible means,\nthen you must either (1) cause the Corresponding Source to be so\navailable, or (2) arrange to deprive yourself of the benefit of the\npatent license for this particular work, or (3) arrange, in a manner\nconsistent with the requirements of this License, to extend the patent\nlicense to downstream recipients.  \"Knowingly relying\" means you have\nactual knowledge that, but for the patent license, your conveying the\ncovered work in a country, or your recipient's use of the covered work\nin a country, would infringe one or more identifiable patents in that\ncountry that you have reason to believe are valid.\n\n  If, pursuant to or in connection with a single transaction or\narrangement, you convey, or propagate by procuring conveyance of, a\ncovered work, and grant a patent license to some of the parties\nreceiving the covered work authorizing them to use, propagate, modify\nor convey a specific copy of the covered work, then the patent license\nyou grant is automatically extended to all recipients of the covered\nwork and works based on it.\n\n  A patent license is \"discriminatory\" if it does not include within\nthe scope of its coverage, prohibits the exercise of, or is\nconditioned on the non-exercise of one or more of the rights that are\nspecifically granted under this License.  
You may not convey a covered\nwork if you are a party to an arrangement with a third party that is\nin the business of distributing software, under which you make payment\nto the third party based on the extent of your activity of conveying\nthe work, and under which the third party grants, to any of the\nparties who would receive the covered work from you, a discriminatory\npatent license (a) in connection with copies of the covered work\nconveyed by you (or copies made from those copies), or (b) primarily\nfor and in connection with specific products or compilations that\ncontain the covered work, unless you entered into that arrangement,\nor that patent license was granted, prior to 28 March 2007.\n\n  Nothing in this License shall be construed as excluding or limiting\nany implied license or other defenses to infringement that may\notherwise be available to you under applicable patent law.\n\n  12. No Surrender of Others' Freedom.\n\n  If conditions are imposed on you (whether by court order, agreement or\notherwise) that contradict the conditions of this License, they do not\nexcuse you from the conditions of this License.  If you cannot convey a\ncovered work so as to satisfy simultaneously your obligations under this\nLicense and any other pertinent obligations, then as a consequence you may\nnot convey it at all.  For example, if you agree to terms that obligate you\nto collect a royalty for further conveying from those to whom you convey\nthe Program, the only way you could satisfy both those terms and this\nLicense would be to refrain entirely from conveying the Program.\n\n  13. Use with the GNU Affero General Public License.\n\n  Notwithstanding any other provision of this License, you have\npermission to link or combine any covered work with a work licensed\nunder version 3 of the GNU Affero General Public License into a single\ncombined work, and to convey the resulting work.  
The terms of this\nLicense will continue to apply to the part which is the covered work,\nbut the special requirements of the GNU Affero General Public License,\nsection 13, concerning interaction through a network will apply to the\ncombination as such.\n\n  14. Revised Versions of this License.\n\n  The Free Software Foundation may publish revised and/or new versions of\nthe GNU General Public License from time to time.  Such new versions will\nbe similar in spirit to the present version, but may differ in detail to\naddress new problems or concerns.\n\n  Each version is given a distinguishing version number.  If the\nProgram specifies that a certain numbered version of the GNU General\nPublic License \"or any later version\" applies to it, you have the\noption of following the terms and conditions either of that numbered\nversion or of any later version published by the Free Software\nFoundation.  If the Program does not specify a version number of the\nGNU General Public License, you may choose any version ever published\nby the Free Software Foundation.\n\n  If the Program specifies that a proxy can decide which future\nversions of the GNU General Public License can be used, that proxy's\npublic statement of acceptance of a version permanently authorizes you\nto choose that version for the Program.\n\n  Later license versions may give you additional or different\npermissions.  However, no additional obligations are imposed on any\nauthor or copyright holder as a result of your choosing to follow a\nlater version.\n\n  15. Disclaimer of Warranty.\n\n  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE.  
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM\nIS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\nALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n  16. Limitation of Liability.\n\n  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\nWILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\nTHE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\nGENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\nUSE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\nDATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\nPARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\nEVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\nSUCH DAMAGES.\n\n  17. Interpretation of Sections 15 and 16.\n\n  If the disclaimer of warranty and limitation of liability provided\nabove cannot be given local legal effect according to their terms,\nreviewing courts shall apply local law that most closely approximates\nan absolute waiver of all civil liability in connection with the\nProgram, unless a warranty or assumption of liability accompanies a\ncopy of the Program in return for a fee.\n\n                     END OF TERMS AND CONDITIONS\n\n            How to Apply These Terms to Your New Programs\n\n  If you develop a new program, and you want it to be of the greatest\npossible use to the public, the best way to achieve this is to make it\nfree software which everyone can redistribute and change under these terms.\n\n  To do so, attach the following notices to the program.  
It is safest\nto attach them to the start of each source file to most effectively\nstate the exclusion of warranty; and each file should have at least\nthe \"copyright\" line and a pointer to where the full notice is found.\n\n    <one line to give the program's name and a brief idea of what it does.>\n    Copyright (C) <year>  <name of author>\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation, either version 3 of the License, or\n    (at your option) any later version.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU General Public License for more details.\n\n    You should have received a copy of the GNU General Public License\n    along with this program.  If not, see <http://www.gnu.org/licenses/>.\n\nAlso add information on how to contact you by electronic and paper mail.\n\n  If the program does terminal interaction, make it output a short\nnotice like this when it starts in an interactive mode:\n\n    <program>  Copyright (C) <year>  <name of author>\n    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.\n    This is free software, and you are welcome to redistribute it\n    under certain conditions; type `show c' for details.\n\nThe hypothetical commands `show w' and `show c' should show the appropriate\nparts of the General Public License.  
Of course, your program's commands\nmight be different; for a GUI interface, you would use an \"about box\".\n\n  You should also get your employer (if you work as a programmer) or school,\nif any, to sign a \"copyright disclaimer\" for the program, if necessary.\nFor more information on this, and how to apply and follow the GNU GPL, see\n<http://www.gnu.org/licenses/>.\n\n  The GNU General Public License does not permit incorporating your program\ninto proprietary programs.  If your program is a subroutine library, you\nmay consider it more useful to permit linking proprietary applications with\nthe library.  If this is what you want to do, use the GNU Lesser General\nPublic License instead of this License.  But first, please read\n<http://www.gnu.org/philosophy/why-not-lgpl.html>."
  },
  {
    "path": "optimization/chatllama/README.md",
    "content": "# **🦙 ChatLLaMA**\r\n\r\n> :warning: Please note this library does NOT contain LLaMA’s weights; to access the weights, you need to apply to Meta's form.\r\n\r\n`ChatLLaMA` 🦙 is a library that allows you to efficiently leverage LLMs fine-tuning capabilities using your own data and the least amount of compute possible. \r\nIts purpose is to give developers peace of mind, by abstracting the efforts required for computational optimization and for the collection of large amounts of data.\r\n\r\nIf you like the project, please show your support by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers).\r\n\r\n## Quick install\r\nYou can install the package with pip:\r\n```bash\r\npip install chatllama-py\r\n```\r\nThen you need to install the Llama models cloned from [Meta's repository](https://github.com/facebookresearch/llama):\r\n```bash\r\ngit clone https://github.com/facebookresearch/llama.git\r\ncd llama\r\npip install -r requirements.txt\r\npip install -e .\r\n```\r\nFollow the instructions in the Llama repository to download the model weights and tokenizer.\r\n\r\n## What can ChatLLaMA help with?\r\n\r\n`ChatLLaMA` 🦙 has been designed to help developers with various use cases, all related to RLHF training and optimized inference. These are some of the use cases that better resonate with our community wishlist:\r\n\r\n- I want to train an efficient ChatGPT-like assistant on my local hardware infrastructure using a limited amount of data;\r\n- I want to create my own personalized version of ChatGPT-like assistant without costs getting out of control;\r\n- I want to understand which model architecture (LLaMA, OPT, GPTJ, etc.) best fits my requirements in terms of hardware, compute budget, and performance;\r\n\r\n## Getting started\r\n\r\nIn this Getting Started we will set up a local RLHF training that will allow you to create your own ChatGPT-like assistant. 
In this example, we used OPT-1.3B, wherever possible we used open-source datasets and ran the training on a NVIDIA A100. If you want to use other models or hardware, we recommend reading the [supported models](#supported-models), [hardware requirements](#hardware-requirements) and [dataset preparation](#dataset-preparation) sections. In this example, we ran a few epochs of the training; this took a few hours. Any feedback on total training time, on any hardware, would be greatly appreciated. Please share your experience with our community on our Discord channel.\r\n\r\nTo quickly get you started, we will focus on 3 key steps:\r\n\r\n1. Download YAML files to customize your training process. Please note that all the parameters of the library can be managed in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml);\r\n2. Prepare the 3 datasets needed to train the actor model, the reward model and perform RLHF;\r\n3. Train the models on your local infrastructure.\r\n\r\n<details>\r\n<summary>1 - YAML download </summary>\r\nFirst, let’s get the artifacts for running ChatLLaMA. The artifacts contain:\r\n\r\n- [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml): config file for model and data set. This allows you to 1) select the model you prefer (LLaMA, OPT, BLOOM, etc) 2) change all the hyperparameters of the training process;\r\n- [`ds_config.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/ds_config.json): config file to define DeepSpeed training parameters;\r\n- [`peft_config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/peft_config.yaml): config file to define PEFT parameters; PEFT is used for efficient training with Hugging Face models. 
It can be used for setting the LoRA parameters as rank and precision.\r\n\r\n- [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json): synthetic data generation templates that can be used to personalize the creation of the dataset. The templates are used for feeding LLMs during the data generation. Note that the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file contains a dictionary having as *keys* the training steps (`actor`, `reward`, `rlhf`) and as *values* a string containing the personalization requests of the user. For more details see the [dataset preparation](#dataset-preparation) section;\r\n- [`main.py`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/main.py): file to train the model.\r\n        \r\n```bash\r\nwget -O artifacts.zip https://nbllabartifacts.blob.core.windows.net/chatllama/artifacts.zip\\?sp\\=r\\&st\\=2023-03-08T14:53:24Z\\&se\\=2100-03-08T22:53:24Z\\&spr\\=https\\&sv\\=2021-06-08\\&sr\\=b\\&sig\\=jqr%2B2ZkR0SW9RjV0pDOdQ%2BDulLXLjbZ36vmNd4XxxyQ%3D\r\nunzip artifacts.zip \r\n```\r\n        \r\nOnce you have run the command above, you will find all the artifacts in the [`artifacts/`](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama/artifacts) directory. Now you can move on to the next section regarding the dataset preparation.\r\n\r\n</details>\r\n\r\n<details>\r\n<summary> 2 - Dataset preparation </summary>\r\n    \r\nBefore training the model, we need to prepare 3 datasets:\r\n\r\n- `actor_training_data`: this is the JSON dataset used in the supervised fine-tuning. It consists of examples of unlabelled conversations, e.g. collection of prompts and responses;\r\n- `rlhf_training_data`: this is the JSON dataset used for RLHF training. 
It consists of a collection of possible input user prompts;\r\n- `reward_training_data`: this is the JSON dataset used to train the reward model. It consists of responses with associated scores.\r\n\r\nIn this example, we are using only publicly available dataset and synthetic generation; if you want to use your own data instead, please see the [Dataset preparation](#dataset-preparation) section.\r\n\r\nFirst, let’s download the `actor_training_data` and the `rlhf_training_data`: \r\n\r\n```bash\r\npython artifacts/download_dataset.py ARLHF --path ./datasets --number_of_samples 200\r\n```\r\n\r\nFinally, let’s create the `reward_training_data` using `davinci-003` for synthetic data generation.\r\n\r\n```bash\r\nexport OPENAI_API_KEY=YOUR_API_KEY\r\npython artifacts/generate_rewards.py ./datasets/reward_training_data.json\r\n```\r\n\r\n> :warning: Creating the `reward_training_data` with `davinci-003` is not free, i.e. it costs a few $$. If you prefer avoiding external paid APIs, we suggest using HuggingFace’s models (e.g. flan_t5_xl) as described in more detail in the [Supported models](#supported-models) section.\r\n> \r\n> :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to \"use the Services to develop foundation models or other large scale models that compete with OpenAI\".\r\n\r\nAt this point, we have successfully created the 3 datasets. 
We can therefore move on to the final section and start the training.\r\n\r\n</details>\r\n\r\n<details>\r\n<summary> 3 - Training </summary>\r\n    \r\nYou can train the 3 models in separate steps:\r\n\r\n- Train the Reward Model\r\n\r\n    ```bash\r\n    python artifacts/main.py artifacts/config/config.yaml --type REWARD\r\n    ```\r\n\r\n- Pre-Train the Actor Model\r\n\r\n    ```bash\r\n    python artifacts/main.py artifacts/config/config.yaml --type ACTOR\r\n    ```\r\n\r\n- Training the Actor with reinforcement learning.\r\n\r\n    ```bash\r\n    python artifacts/main.py artifacts/config/config.yaml --type RL\r\n    ```\r\n\r\n\r\nor, equivalently, the 3 trainings can also be pipelined using the flag ALL.\r\n\r\n```bash\r\npython artifacts/main.py artifacts/config/config.yaml --type ALL\r\n```\r\n\r\nNote that the path to the datasets and the training hyper-parameters of the training process are specified in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml) file.\r\n\r\n</details>\r\n\r\n## Contributing and Roadmap\r\n\r\nAs an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see our [Roadmap page](https://github.com/users/nebuly-ai/projects/1/views/1) for more information on how to get involved.\r\n\r\nYou can participate in the following ways:\r\n\r\n1. Submit an issue or PR on GitHub\r\n2. Join our [Discord group](https://discord.gg/77d5kGSa8e) to chat\r\n\r\n## Supported models\r\n\r\n<details><summary><b><i> Actor models </i></b></summary>\r\n\r\nWe support models that can be run efficiently with a limited amount of compute, such as LLaMA and 🤗 transformers. 
These are the models with less than 20B parameters currently supported:\r\n\r\n- LLaMA: 7B and 13B, please note this library does NOT contain LLaMA’s weights; to access the weights, you need to apply to Meta's [form](https://forms.gle/jk851eBVbX1m5TAv5).\r\n- GPTJ: 6B\r\n- GPTNeoX: 1.3B, 20B\r\n- **(⚠️WIP)** Flan-T5: 80M, 259M, 780M, 3B, 11B\r\n- OPT: 125M, 359M, 1.3B, 2.7B, 6.7B, 13B\r\n- BLOOM: 560M, 1.1B, 1.7B, 3B, 7.1B\r\n- BLOOMZ: 560M, 1.1B, 1.7B, 3B, 7.1B\r\n- Galactica: 125M, 1.3B, 6.7B\r\n</details>\r\n\r\n<details><summary><b><i> Reward models </i></b></summary>\r\n\r\nWe suggest using models under 6B from 🤗 transformers: \r\n\r\n- GPT2: 124M, 355M, 774M, 1.5B\r\n- OPT: 125M, 359M, 1.3B, 2.7B\r\n- GPTJ: 6B\r\n- BLOOMZ: 560M, 1.1B, 1.7B, 3B\r\n- **(⚠️WIP)** OpenAssistant [pre-trained reward models](https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2)\r\n</details>\r\n\r\n<details>\r\n<summary><b><i> Synthetic data generation models </i></b></summary>\r\n\r\nWe support both APIs from OpenAI and  🤗 transformers:\r\n\r\n- OpenAI: da-vinci-003, gpt-3.5-turbo **(⚠️WIP)**\r\n- HuggingFace: Flan-T5 (3B and 11B)\r\n\r\n> :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to \"use the Services to develop foundation models or other large scale models that compete with OpenAI\".\r\n\r\nIf you need support for different models, please open an issue and we will get to work.\r\n</details>\r\n\r\n## Hardware requirements\r\n\r\n<details><summary><b><i> Training </i></b></summary>\r\n\r\nLarger actor models require more powerful hardware. 
Here is a rough hardware recommendation table, suggesting the right type of hardware for different actor model sizes:\r\n\r\n- 125M to 1.3B → 1x Nvidia 3090/4090\r\n- 1.3B to 3B → 1x Nvidia A100 (80Gb)\r\n- 3B with DeepSpeed CPU off-loading → 1x Nvidia 3090/4090\r\n- 3B to 7B with DeepSpeed ZeRO → 4x Nvidia T4\r\n- 3B to 13B → 4x Nvidia A100 (80Gb)\r\n- 13B to 20B with DeepSpeed ZeRO → 4x Nvidia A100 (80Gb)\r\n- 13B to 20B → 8x Nvidia A100 (80Gb)\r\n</details>\r\n\r\n<details><summary><b><i> Inference </i></b></summary>\r\n\r\n**(⚠️WIP)** When it comes to inference optimization, ChatLLaMA will support the following optimization techniques:\r\n\r\n- [ ]  DeepSpeed ZeRO\r\n- [ ]  FlexGen\r\n- [ ]  HF Accelerate\r\n- [ ]  PyTorch Vanilla\r\n</details>\r\n\r\nPlease note that inference optimization has yet to be implemented. If you would like to contribute, please see the **issue roadmap**, community contributions are always welcome 😊.\r\n\r\n## Dataset preparation\r\n\r\nTo successfully train a ChatLLaMA assistant, you need 3 different datasets: `actor_training_data`, `rlhf_training_data` and `reward_training_data`.\r\n\r\n<details>\r\n<summary> Dataset for supervised fine-tuning of the actor model </summary>\r\n    \r\nThe `actor_training_data` is a collection of prompts with the associated responses as highlighted below:\r\n\r\n```json\r\n[\r\n  {\r\n      \"user_input\": \"here the input of the user\",\r\n      \"completion\": \"here the model completion\"\r\n  }\r\n]\r\n```\r\n\r\nChatLLaMA supports 4 different options to prepare the `actor_training_data`:\r\n\r\n* <details><summary> Use 100% synthetic data </summary>\r\n\r\n  The dataset can be synthetically generated by running the following command:\r\n\r\n  ```bash\r\n  python artifacts/generate_actor_dataset.py\r\n  ```\r\n\r\n  > :warning: Note that this command will require a subscription to OpenAI. 
Generating the full dataset with `davinci-003` could cost approximately ~200$.\r\n  > \r\n  > :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to \"use the Services to develop foundation models or other large scale models that compete with OpenAI\".\r\n\r\n  Alternatively, you can generate the dataset for free using 🤗 transformers as described in the section [Supported models](#supported-models).\r\n  </details>\r\n  \r\n* <details><summary> Use one of the open source datasets with assistant interactions </summary>\r\n\r\n  Currently, we support:\r\n\r\n  - [Anthropic HH RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf): this dataset consists of structured question/answer pairs with an LLM chatbot that includes selected and rejected answers;\r\n  - [Stanford Human Preferences Dataset (SHP)](https://huggingface.co/datasets/stanfordnlp/SHP): this dataset is curated from selected \"ask\" subreddits, and includes questions that span a wide range of question/answer pairs based on the most upvoted responses. Please note that, unlike HH RLHF, this dataset is not intended to reduce harassment by selecting the ideal chatbot response, but instead weights the most helpful human responses.\r\n\r\n  The datasets can be downloaded running the following command:\r\n\r\n  ```bash\r\n  python artifacts/download_dataset.py <dataset_name> --path <path_to_folder_for_download> --number_of_samples <N>\r\n  ```\r\n\r\n  Where: \r\n\r\n  - `<dataset_name>` could be \"SHP\" for the StanfordNLP/SHP dataset or \"ARLHF\" for the Anthropic/hh-rlhf dataset;\r\n  - `<path_to_folder_for_download>` is the folder path to where the datasets are going to be created;\r\n  - `<N>` is the number of samples of which the reward_dataset.json is composed.\r\n  </details>\r\n  \r\n  \r\n* <details><summary> Use 100% personalized dataset </summary>\r\n\r\n  The user provides his own personalized full dataset. 
Datasets must be JSON files with the following format:\r\n\r\n  ```\r\n  [\r\n      {\r\n          \"user_input\": \"here the input of the user\",\r\n          \"completion\": \"here the model completion\"\r\n      }\r\n  ]\r\n  ```\r\n\r\n  Where the list contains multiple dictionaries, and each dictionary corresponds to a data sample. We suggest using more than 1000 data samples to run the actor training.\r\n  </details>\r\n\r\n* <details><summary> (⚠️WIP) Create the full dataset augmenting few custom data samples </summary>\r\n\r\n  The dataset can be generated synthetically from a few prompt+response examples provided by the user (few =>10).\r\n  </details>\r\n</details>\r\n\r\n<details>\r\n<summary> Dataset for RLHF </summary>\r\n    \r\nThe dataset for RLHF consists just of prompt examples:\r\n\r\n```json\r\n[\r\n    {\r\n        \"user_input\": \"here the example of user input\"\r\n    }\r\n]\r\n```\r\n\r\nIt can be provided in 2 different ways:\r\n\r\n* <details><summary> Few examples provided by the user and dataset synthetically expanded using LLM </summary>\r\n\r\n    You need to add the key `rlhf` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file with the information about the task you want to perform and extra context needed by the LLM for the generation. Here is an example of template:\r\n\r\n    ```json\r\n    {\r\n      \"rlhf\": \"Here is the template for the generating RLHF prompts. 
The task we want to perform is ...\"\r\n    }\r\n    ```\r\n\r\n     *Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json)*\r\n     </details>\r\n\r\n* <details><summary> The user provides the full dataset with possible interactions with the model </summary>\r\n\r\n    The dataset needs to contain more than 1000 prompt examples:\r\n\r\n    ```json\r\n    [\r\n        {\r\n            \"user_input\": \"here the example of user input\"\r\n        }\r\n    ]\r\n    ```\r\n\r\n    The file must be named `rlhf_training_data.json`.\r\n    </details>\r\n</details>\r\n<details>\r\n<summary><b> Dataset to train the reward model </b></summary>\r\n\r\nThe `reward_training_data` is a collection of i) prompts, ii) completion and iii) score of the completion assigned accordingly to the user feedback (the Human Feedback in RLHF). \r\n\r\n```json\r\n[{\r\n\t\"user_input\": \"...\",\r\n\t\"completion\": \"...\",\r\n\t\"score\": 1\r\n},\r\n\t...\r\n]\r\n```\r\n\r\nWe support 3 different options to prepare the `reward_training_data`: \r\n\r\n- Fully Synthetic Score Generation\r\n    \r\n    In this case the reward dataset can be synthetically scored using a LLM as Human Feedback. We recommend the `reward_training_data` having at least 100 data samples.\r\n    \r\n    ```json\r\n    [{\r\n    \t\"user_input\": \"...\",\r\n    \t\"completion\": \"...\",\r\n    \t\"score\": None\r\n    },\r\n    \t...\r\n    ]\r\n    ```\r\n    \r\n    A LLM model is used to assign the score to each entry. \r\n    \r\n    The LLM needs a prompt template containing all the instructions to evaluate the generated text. To do this, you should add the key `reward` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file. 
Here is an example:\r\n    \r\n    ```json\r\n    {\r\n    \t\"reward\": \"Here is the template for the reward model. The rules are:\\n\\n1.Rule 1\\n\\n2. Rule 2\"\r\n    }\r\n    ```\r\n    \r\n    If no template is provided the default one is used. You can find the default template in `artifacts/generate_rewards.py`. Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json). \r\n    \r\n    Once you have the unlabelled dataset, you can generate the scores by running the following command:\r\n    \r\n    ```bash\r\n    python artifacts/generate_rewards.py <dataset_path> --model <model_to_use> --temperature <t> --max_tokens <n> --reward_template <path_to_file.json>\r\n    ```\r\n    \r\n    Where:\r\n    \r\n    - `<dataset_path>` path to the reward dataset to be scored;\r\n    - `<model_to_use>` model to use for the reward. Default and suggested text-davinci-003 (More to come);\r\n    - `<temperature>` temperature used to score the model; temperature=0.1;\r\n    - `<max_tokens>` max_tokens of the generation;\r\n    - `<reward_template>` is the path to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file containing the template to be used for generating the reward. 
If no path is provided, the default template will be used.\r\n- The user provides their personalized full dataset\r\n    \r\n    Datasets must be JSON files in the following format:\r\n    \r\n    ```json\r\n    [\r\n        {\r\n            \"user_input\": \"here type the user input\",\r\n            \"completion\": \"here type the completion\",\r\n            \"score\": 4.0\r\n        },\r\n        {\r\n            \"user_input\": \"here type the user input\",\r\n            \"completion\": \"random garbage\",\r\n            \"score\": 0.0\r\n        }\r\n    ]\r\n    ```\r\n    \r\n    Note that at least 100 data samples are required in this case. The file must be named `reward_training_data.json`\r\n    \r\n- **(⚠️WIP)** Few examples provided by the user and dataset synthetically expanded using LLM\r\n</details>\r\n\r\n# License\r\n\r\nSee the [LICENSE](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/LICENSE) file.\r\n"
  },
  {
    "path": "optimization/chatllama/artifacts/config/config.yaml",
    "content": "---\ntrainer_config:\n  # learning rates\n  actor_lr: 0.000005\n  critic_lr: 0.000009\n  # PPO Hyperparameters\n  actor_eps_clip: 0.2\n  critic_eps_clip: 0.2\n  beta_s: 0.02\n  # coefficient for the discounted rewards\n  gamma_discounted: 1 \n  # path to examples to be sampled (training dataset) see rlhf_dataset.json\n  examples_path: \"./datasets/rlhf_training_data.json\"\n  # number of episodes and generation performed for each episode\n  # in the train() method\n  num_episodes: 100\n  max_timesteps: 32\n  # number of timesteps after which the learn() method is called \n  # (to update the weights)\n  update_timesteps: 32\n  # number of example sampled at each timestep\n  num_examples: 1\n  # batch and epochs for the training\n  batch_size: 1\n  epochs: 1\n  # number of episodes after which update the checkpoints in RL training\n  checkpoint_steps: 1000\n  # here specify the name of the actor_rl checkpoint from which resume \n  # during actor RL training. If null load the last one.\n  checkpoint_name: null\n\nactor_config:\n  model: \"facebook/opt-1.3b\"\n  model_folder: \"./models\"\n  tokenizer_path: \"path-to-tokenizer\"\n  train_dataset_path: \"./datasets/actor_training_data.json\"\n  validation_dataset_path: null\n  # froze model embedding during training\n  froze_embeddings: True\n  # use fairscale layers to build the model instead of vanilla pytorch\n  # only for llama\n  use_fairscale: False\n  # max sequence length for the actor (i.e. 
prompt + completion) it depends on\n  # the model used.\n  max_sequence_length: 2048\n  # max tokens generated by the actor (completion only)\n  max_tokens: 2048\n  # minimum number of tokens generated by the actor\n  min_tokens: 100\n  # additional prompt tokens to be used for template or as safety\n  additonal_prompt_tokens: 20\n  # temperature for the actor\n  temperature: 0.1\n  batch_size: 2\n  # number iteration after print\n  iteration_per_print: 1\n  lr: 0.000009\n  epochs: 1\n  # number of backpropagation after saving the checkpoints\n  checkpoint_steps: 5000\n  # number of checkpoints to keep while removing the older \n  # (keep memory consumption of checkpoints reasonable)\n  n_checkpoints_to_keep: 5\n  # here specify the name of the actor checkpoint from which resume \n  # during actor training. If null load the last one.\n  checkpoint_name: null\n  # deepspeed settings\n  deepspeed_enable: False\n  deepspeed_config_path: \"./artifacts/config/ds_config.json\"\n  # accelerate settings\n  accelerate_enable: False\n  # use_peft - the parameters of PEFT can be modified in the peft_config.yaml\n  peft_enable: False\n  peft_config_path: \"./artifacts/config/peft_config.yaml\"\n\nreward_config:\n  # model to be chosen are gp2-large, bart-base, longformer-base-4096\n  # more can be simply added in the reward.py __init__()\n  model: \"facebook/opt-125m\"\n  model_folder: \"./models\"\n  # hidden size of the additional ffw head to produce the scores\n  model_head_hidden_size: 2048\n  max_sequence_length: 2048\n  train_dataset_path: \"./datasets/reward_training_data.json\"\n  validation_dataset_path: null\n  batch_size: 8\n  epochs: 1\n  iteration_per_print: 1\n  # steps after which the checkpoint are saved\n  checkpoint_steps: 10000\n  # here specify the name of the reward checkpoint from which resume \n  # during reward training. 
If null, load the last one.\n  checkpoint_name: null\n  lr: 0.000009\n  # deepspeed settings\n  deepspeed_enable: False\n  deepspeed_config_path: \"./artifacts/config/ds_config.json\"\n  # accelerate settings\n  accelerate_enable: False\n\ncritic_config:\n  # models that can be chosen are gpt2-large, bart-base, longformer-base-4096\n  # more can be simply added in the reward.py __init__()\n  model: \"facebook/opt-125m\"\n  # hidden size of the additional ffw head to produce the scores\n  model_head_hidden_size: 2048\n  max_sequence_length: 2048\n  model_folder: \"./models\"\n  # here specify the name of the critic checkpoint from which to resume \n  # during critic training. If null, load the last one.\n  checkpoint_name: null\n"
  },
  {
    "path": "optimization/chatllama/artifacts/config/ds_config.json",
    "content": "{\n    \"train_batch_size\": 8,\n    \"gradient_accumulation_steps\": 1,\n    \"optimizer\": {\n      \"type\": \"Adam\",\n      \"params\": {\n        \"lr\": 0.00015\n      }\n    },\n    \"fp16\": {\n      \"enabled\": false,\n      \"auto_cast\": false,\n      \"loss_scale\": 0,\n      \"initial_scale_power\": 16,\n      \"loss_scale_window\": 1000,\n      \"hysteresis\": 2,\n      \"min_loss_scale\": 1\n  },\n  \"zero_optimization\": {\n    \"stage\": 2,\n    \"allgather_partitions\": true,\n    \"allgather_bucket_size\": 5e8,\n    \"overlap_comm\": false,\n    \"reduce_scatter\": true,\n    \"reduce_bucket_size\": 5e8,\n    \"contiguous_gradients\" : true,\n    \"offload_param\": {\n      \"device\": \"cpu\",\n      \"nvme_path\": \"/local_nvme\",\n      \"pin_memory\": true,\n      \"buffer_count\": 5,\n      \"buffer_size\": 1e8,\n      \"max_in_cpu\": 1e9\n    },\n    \"offload_optimizer\": {\n      \"device\": \"cpu\",\n      \"nvme_path\": \"/local_nvme\",\n      \"pin_memory\": true,\n      \"buffer_count\": 4,\n      \"fast_init\": false\n    },\n    \"stage3_max_live_parameters\" : 1e9,\n    \"stage3_max_reuse_distance\" : 1e9,\n    \"stage3_prefetch_bucket_size\" : 5e8,\n    \"stage3_param_persistence_threshold\" : 1e6,\n    \"sub_group_size\" : 1e12,\n    \"elastic_checkpoint\" : true,\n    \"stage3_gather_16bit_weights_on_model_save\": true,\n    \"ignore_unused_parameters\": true,\n    \"round_robin_gradients\": true\n    }\n  }"
  },
  {
    "path": "optimization/chatllama/artifacts/config/peft_config.yaml",
    "content": "---\ninference_mode: False\nr: 8\nlora_alpha: 32\nlora_dropout: 0.1\n"
  },
  {
    "path": "optimization/chatllama/artifacts/datasets/actor_dataset.json",
    "content": "[\n    {\n        \"user_input\": \"here the input of the user\",\n        \"completion\": \"here the model completion\"\n    }\n]"
  },
  {
    "path": "optimization/chatllama/artifacts/datasets/reward_dataset.json",
    "content": "[\n    {\n        \"user_input\": \"here type the user input\",\n        \"completion\": \"here type the completion\",\n        \"score\": 4.0\n    },\n    {\n        \"user_input\": \"here type the user input\",\n        \"completion\": \"if score is null, it can be evaluated by davinci using reward_trainer.distill()\",\n        \"score\": null \n    }\n]\n"
  },
  {
    "path": "optimization/chatllama/artifacts/datasets/rlhf_dataset.json",
    "content": "[\n    {\n        \"user_input\": \"here the example of user input\"\n    }\n]"
  },
  {
    "path": "optimization/chatllama/artifacts/download_dataset.py",
    "content": "import argparse\nimport os\n\nfrom chatllama.rlhf.dataset import AnthropicRLHF, StanfordNLPSHPDataset\n\n\nif __name__ == \"__main__\":\n\n    # Setup argument parser\n    parser = argparse.ArgumentParser(\n        prog=\"download_dataset.py\",\n        description=\"Download open source datasets for RLHF training\",\n    )\n\n    parser.add_argument(\n        \"dataset_name\",\n        help=(\n            \"Dataset to download. SHP: stanfordnlp/SHP, \"\n            \"ARLHF: Anthropic/hh-rlhf\"\n        ),\n        choices=[\"SHP\", \"ARLHF\"],\n    )\n    parser.add_argument(\n        \"-p\",\n        \"--path\",\n        help=\"Specify the path for the dataset\",\n        default=\"./datasets\",\n    )\n    parser.add_argument(\n        \"-n\",\n        \"--number_of_samples\",\n        help=\"Specify the number of samples for the reward dataset\",\n        default=200,\n    )\n\n    args = parser.parse_args()\n    if os.path.exists(args.path) is False:\n        os.mkdir(args.path)\n\n    try:\n        n_samples = int(args.number_of_samples)\n    except ValueError:\n        raise ValueError(\"Number of samples should be an integer\")\n\n    if args.dataset_name == \"SHP\":\n        dataset = StanfordNLPSHPDataset()\n        dataset.save_dataset(args.path, n_samples)\n\n    elif args.dataset_name == \"ARLHF\":\n        dataset = AnthropicRLHF()\n        dataset.save_dataset(\n            args.path,\n            n_samples,\n        )\n"
  },
  {
    "path": "optimization/chatllama/artifacts/extend_rlhf_dataset.py",
    "content": "import os.path\n\nimport numpy as np\nfrom langchain import OpenAI, LLMChain, PromptTemplate\nfrom transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n\n\ndef _get_template_and_variables(prompt: str, with_examples: bool):\n    if with_examples:\n        template = prompt + \"\\n\\nExample: {example}\"\n        variables = [\"example\"]\n    else:\n        template = prompt\n        variables = []\n    return template, variables\n\n\ndef use_langchain_model(\n    user_prompt: str,\n    model_name: str,\n    temperature: float = 0.7,\n    max_tokens: int = 2048,\n    with_examples: bool = False,\n) -> LLMChain:\n    llm = OpenAI(\n        model_name=model_name, temperature=temperature, max_tokens=max_tokens\n    )\n    template, input_variables = _get_template_and_variables(\n        user_prompt, with_examples=with_examples\n    )\n    prompt_template = PromptTemplate(\n        template=template,\n        input_variables=input_variables,\n    )\n\n    return LLMChain(llm=llm, prompt=prompt_template)\n\n\nclass HuggingFaceChain:\n    def __init__(\n        self, model_name: str, user_prompt: str, with_examples: bool = False\n    ):\n        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n        self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n        self.prompt, self.input_variables = _get_template_and_variables(\n            user_prompt, with_examples=with_examples\n        )\n\n    def run(self, **kwargs):\n        prompt = self.prompt.format(**kwargs)\n        input_ids = self.tokenizer.encode(prompt, return_tensors=\"pt\")\n        output = self.model.generate(\n            input_ids, max_length=100, num_beams=5, early_stopping=True\n        )\n        return self.tokenizer.decode(output[0], skip_special_tokens=True)\n\n\ndef use_huggingface_model(\n    user_prompt: str,\n    model_name: str,\n    with_examples: bool = False,\n) -> HuggingFaceChain:\n    return HuggingFaceChain(\n        model_name, user_prompt, 
with_examples=with_examples\n    )\n\n\ndef main():\n    import json\n    from argparse import ArgumentParser\n\n    parser = ArgumentParser()\n    parser.add_argument(\n        \"--model\",\n        type=str,\n        help=\"Model name.\",\n        default=\"google/flan-t5-xl\",\n    )\n    parser.add_argument(\"--templates\", type=str, help=\"Path to templates.\")\n    parser.add_argument(\"--num_prompts\", type=int, default=1000)\n    parser.add_argument(\n        \"--data_dir\", type=str, help=\"Path where data are stored\"\n    )\n\n    args = parser.parse_args()\n    model_name = args.model\n    templates_path = args.templates\n    data_dir = args.data_dir\n\n    with open(os.path.join(data_dir, \"rlhf_training_data.json\"), \"r\") as f:\n        examples = json.load(f)\n\n    with open(templates_path, \"r\") as f:\n        templates = json.load(f)\n    user_prompt = templates.get(\"rlhf\")\n    if user_prompt is None:\n        raise ValueError(\"No rlhf template found.\")\n\n    if \"davinci\" in model_name:\n        chain = use_langchain_model(\n            user_prompt, model_name, with_examples=True\n        )\n    else:\n        if \"t5\" not in model_name:\n            raise ValueError(\"Only Flan-t5 models are supported for HF.\")\n        chain = use_huggingface_model(\n            user_prompt, model_name, with_examples=True\n        )\n\n    for i in range(args.num_prompts):\n        example = np.random.choice(examples)\n        new_example = chain.run(example=example[\"user_input\"])\n        example_dict = {\"user_input\": new_example}\n        examples.append(example_dict)\n\n    with open(os.path.join(data_dir, \"rlhf_training_data.json\"), \"w\") as f:\n        json.dump(examples, f)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "optimization/chatllama/artifacts/generate_actor_dataset.py",
    "content": "from langchain import OpenAI, LLMChain, PromptTemplate\nfrom langchain.chains.conversation.memory import (\n    ConversationBufferWindowMemory,\n)\n\nfrom chatllama.langchain_modules.prompt_templates import (\n    PERSON_CHATBOT_TEMPLATE,\n    AI_CHATBOT_TEMPLATE,\n)\n\n\nCONVERSATION_LENGTH = 20\n\n\ndef create_conversation(human_agent: LLMChain, bot_agent: LLMChain):\n    conversation = []\n    chatbot_output = \"\"\n    for i in range(CONVERSATION_LENGTH):\n        # Human agent goes first\n        human_output = human_agent.run(chatbot_input=chatbot_output)\n        conversation.append(f\"Human: {human_output}\")\n        chatbot_output = bot_agent.run(human_input=human_output)\n        conversation.append(f\"AI: {chatbot_output}\")\n    return \"\\n\".join(conversation)\n\n\ndef build_agents():\n    # be aware that too long completions will not fit the sequence length\n    # of possible critic or reward models ...\n    llm = OpenAI(max_tokens=2048, temperature=0.7)\n    human_template = PromptTemplate(**PERSON_CHATBOT_TEMPLATE)\n    human_agent = LLMChain(\n        llm=llm,\n        prompt=human_template,\n        memory=ConversationBufferWindowMemory(k=4),\n    )\n    bot_template = PromptTemplate(**AI_CHATBOT_TEMPLATE)\n    bot_agent = LLMChain(\n        llm=llm,\n        prompt=bot_template,\n        memory=ConversationBufferWindowMemory(k=4),\n    )\n    return human_agent, bot_agent\n\n\ndef get_sub_conversations(conversation: str, system_prompt: str):\n    interactions = conversation.split(\"AI:\")\n    sub_conversations = []\n    for i in range(len(interactions) - 1):\n        user_input = system_prompt + \"AI:\".join(interactions[: i + 1])\n        completion = interactions[i + 1].split(\"Human:\")[0].strip()\n        sub_conversations.append(\n            {\"user_input\": user_input, \"completion\": completion}\n        )\n    return sub_conversations\n\n\ndef main():\n    import json\n    import os\n    from argparse import 
ArgumentParser\n\n    parser = ArgumentParser()\n    parser.add_argument(\"--num_conversations\", type=int, default=1000)\n    parser.add_argument(\"--output_dir\", type=str, default=\"conversations\")\n    parser.add_argument(\"--templates\", type=str, default=None)\n    args = parser.parse_args()\n\n    if args.templates is not None:\n        with open(args.templates, \"r\") as f:\n            templates = json.load(f)\n        template = templates[\"actor\"]\n    else:\n        template = \"\"\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n    for conv in range(args.num_conversations):\n        human_agent, bot_agent = build_agents()\n        conversation = create_conversation(human_agent, bot_agent)\n        with open(\n            os.path.join(args.output_dir, f\"conversation_{conv}.txt\"), \"w\"\n        ) as f:\n            f.write(conversation)\n\n    # convert the conversations to a single json file\n    data = []\n    for conv in range(args.num_conversations):\n        with open(\n            os.path.join(args.output_dir, f\"conversation_{conv}.txt\"), \"r\"\n        ) as f:\n            conversation = f.read()\n        sub_conversations = get_sub_conversations(conversation, template)\n        data.extend(sub_conversations)\n    with open(\n        os.path.join(args.output_dir, \"actor_training_data.json\"), \"w\"\n    ) as f:\n        json.dump(data, f)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "optimization/chatllama/artifacts/generate_rewards.py",
    "content": "import argparse\nimport json\n\nfrom langchain import OpenAI, LLMChain, PromptTemplate\n\n\nclass ScoreGenerator:\n    def __init__(\n        self,\n        llm_model: str,\n        llm_temperature: float,\n        llm_max_tokens: int,\n        reward_template: dict,\n    ) -> None:\n\n        self.llm_max_tokens = llm_max_tokens\n        self.llm_temperature = llm_temperature\n        self.llm_model = llm_model\n\n        # initialize LLM and LangChain\n        openai_llm = OpenAI(\n            model_name=llm_model,\n            temperature=llm_temperature,\n            max_tokens=llm_max_tokens,\n        )\n\n        # Customaize your own Reward template by changing the\n        # prompt_template\n        prompt_template = PromptTemplate(**reward_template)\n        print(prompt_template)\n        self.llm = LLMChain(llm=openai_llm, prompt=prompt_template)\n\n    def distill(\n        self,\n        dataset_path: str,\n    ) -> None:\n        \"\"\"Parse the dataset and assign scores using LLMs\n        then save back the dataset with the uploaded scores\n        \"\"\"\n\n        print(\"Assigning scores to the reward dataset...\")\n\n        # load the dataset\n        with open(dataset_path, \"r\") as f:\n            train_data = json.load(f)\n\n        # for each element of the dataset, assing a score.\n        for i, data in enumerate(train_data):\n            if data.get(\"score\", None) is None:\n\n                user_input = data[\"user_input\"]\n                completion = data[\"completion\"]\n                print(\n                    f\"#### Data {i}:\\n\"\n                    f\"#### User_input:\\n {user_input}\\n\"\n                    f\"#### Completion:\\n {completion}\\n\"\n                )\n                prompt_tokens = (\n                    data[\"user_input\"]\n                    + data[\"completion\"]\n                    + self.llm.prompt.template\n                )\n                prompt_len = 
int(len(prompt_tokens.split(\" \")) / 0.75)\n                # 80% of the max length as safety margin\n                if prompt_len > self.llm_max_tokens * 0.8:\n                    print(\n                        f\"The prompt of the data {i} is too long\\n\"\n                        f\"tokens: {prompt_len}\\n\"\n                        f\"max_tokens: {self.llm_max_tokens * 0.8}\"\n                    )\n                    continue\n                score = self.llm.run(\n                    user_input=data[\"user_input\"],\n                    completion=data[\"completion\"],\n                ).strip()\n                # TODO: extract from score the float value with a regex\n                try:\n                    score = float(score)\n                except Exception:\n                    print(\n                        f\"The score returned by the LLM for the\"\n                        f\"data, {i}, is not a float float:\\n{score}\"\n                    )\n                    continue\n                data[\"score\"] = score\n                print(f\"### Score: {score} \\n\\n\")\n        # remove all the data that have no score\n        train_data = [data for data in train_data if data.get(\"score\", None)]\n        # save the dataset back\n        print(\"Writing the updated dataset back to disk ... 
\")\n        with open(dataset_path, \"w\") as f:\n            json.dump(train_data, f)\n\n        print(\"Score Assignment Completed\")\n\n\nif __name__ == \"__main__\":\n\n    REWARD_TEMPLATE = dict(\n        template=(\n            \"You have to evaluate the following chat with a score\"\n            \"between 0 and 5\"\n            \"You MUST evaluate: text quality, content quality and\"\n            \"coherence.\\n\"\n            \"You MUST return only the number that represents your\"\n            \"judgment.\\n\"\n            \"The input of the user is: {user_input}\\n\"\n            \"The output of the chatbot is: {completion}\\n\"\n            \"The score is:\\n\"\n        ),\n        input_variables=[\"user_input\", \"completion\"],\n    )\n\n    # Setup argument parser\n    parser = argparse.ArgumentParser(\n        prog=\"generate_rewards.py\",\n        description=\"Generate rewards using LangChain and LLMs\",\n    )\n\n    parser.add_argument(\"dataset_path\", help=\"Path to the dataset\")\n    parser.add_argument(\n        \"-m\",\n        \"--model\",\n        help=\"Specify the model to be used\",\n        default=\"text-davinci-003\",\n    )\n    parser.add_argument(\n        \"-t\",\n        \"--temperature\",\n        help=\"Specify the temperature of the score assignment\",\n        default=0.5,\n    )\n    parser.add_argument(\n        \"-k\",\n        \"--max_tokens\",\n        help=\"Specify the max tokens of the score assignement\",\n        default=2048,\n    )\n    parser.add_argument(\n        \"-r\",\n        \"--reward_template\",\n        help=\"Specify the reward template to be used\",\n        default=None,\n    )\n\n    # parse arguments\n    args = parser.parse_args()\n\n    if args.reward_template:\n        templates = json.loads(args.reward_template)\n        if templates.get(\"reward\", None) is None:\n            rw_template = REWARD_TEMPLATE\n        else:\n            rw_template = templates[\"reward\"]\n    else:\n        
rw_template = REWARD_TEMPLATE\n\n    score_generator = ScoreGenerator(\n        args.model, args.temperature, args.max_tokens, rw_template\n    )\n\n    score_generator.distill(args.dataset_path)\n"
  },
  {
    "path": "optimization/chatllama/artifacts/main.py",
    "content": "import argparse\n\nfrom chatllama.rlhf.actor import ActorTrainer\nfrom chatllama.rlhf.config import Config\nfrom chatllama.rlhf.dataset import BaseDataset\nfrom chatllama.rlhf.reward import RewardTrainer\nfrom chatllama.rlhf.trainer import RLTrainer\n\n\n# Setup argument parser\nparser = argparse.ArgumentParser(\n    prog=\"main.py\", description=\"RLHF Training of ChatBots\"\n)\n\nparser.add_argument(\"configfile\", help=\"Path to config.yaml file\")\nparser.add_argument(\n    \"-t\",\n    \"--type\",\n    help=(\n        \"Specify the training type. RL: Training of the model using RL.\"\n        \"ACTOR: Training of the actor model. \"\n        \"REWARD: Training of the reward model.\"\n        \"RL: The whole pipeline with the three training steps\"\n    ),\n    default=\"ALL\",\n    choices=[\"ALL\", \"RL\", \"ACTOR\", \"REWARD\"],\n)\nparser.add_argument(\n    \"-a\", \"--actor\", help=\"Specify actor model by name\", default=None\n)\nparser.add_argument(\n    \"-r\", \"--reward\", help=\"Specify reward model by name\", default=None\n)\nparser.add_argument(\"--local_rank\", help=\"Local rank parameter for deepspeed\", default=None)\n\n# parse arguments\nargs = parser.parse_args()\n\n# load config.yaml with all the project informations\nconfig = Config(args.configfile)\n\n# overwrite config if specified differently\nif args.actor is not None:\n    config.actor.model = args.actor\nif args.reward is not None:\n    config.reward.model = args.reward\n\n# perform the desired training\nif args.type == \"RL\":\n    max_seq = min(\n        config.actor.max_sequence_length,\n        config.reward.max_sequence_length,\n        config.critic.max_sequence_length,\n    )\n    config.actor.max_sequence_length = max_seq\n    BaseDataset.clean_dataset(config)\n    rlhf_trainer = RLTrainer(config)\n    rlhf_trainer.train()\nelif args.type == \"ACTOR\":\n    BaseDataset.clean_dataset(config.actor)\n    actor_trainer = ActorTrainer(config.actor)\n    
actor_trainer.train()\nelif args.type == \"REWARD\":\n    BaseDataset.clean_dataset(config.reward)\n    reward_trainer = RewardTrainer(config.reward)\n    reward_trainer.train()\nelif args.type == \"ALL\":\n    reward_trainer = RewardTrainer(config.reward)\n    reward_trainer.train()\n    actor_trainer = ActorTrainer(config.actor)\n    actor_trainer.train()\n    rlhf_trainer = RLTrainer(config)\n    rlhf_trainer.train()\n"
  },
  {
    "path": "optimization/chatllama/artifacts/templates.json",
    "content": "{\n    \"rlhf\": \"You are an AI assistant used to generate possible prompts instructions for a chatbot, here is an example of conversation.\"\n}"
  },
  {
    "path": "optimization/chatllama/chatllama/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/chatllama/chatllama/langchain_modules/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/chatllama/chatllama/langchain_modules/prompt_templates.py",
    "content": "REWARD_TEMPLATE = dict(\n    template=(\n        \"You have to evaluate the following chat with a score between 0 and 5\"\n        \"You MUST evaluate: text quality, content quality and\"\n        \"coherence.\\n\"\n        \"You MUST return only the number that represents your\"\n        \"judgment.\\n\"\n        \"The assignement is:\\n{user_input}\\n\"\n        \"The completion is:\\n{completion}\\n\"\n    ),\n    input_variables=[\"user_input\", \"completion\"],\n)\n\n\nAI_CHATBOT_TEMPLATE = dict(\n    template=(\n        \"Assistant is a large language model trained by Meta and Nebuly.ai\\n\"\n        \"Assistant is designed to be able to assist with a wide range of \"\n        \"tasks, from answering simple questions to providing in-depth \"\n        \"explanations and discussions on a wide range of topics. As a \"\n        \"language model, Assistant is able to generate human-like text \"\n        \"based on the input it receives, allowing it to engage in \"\n        \"natural-sounding conversations and provide responses that are \"\n        \"coherent and relevant to the topic at hand.\\n\\n\"\n        \"Assistant is constantly learning and improving, and its capabilities \"\n        \"are constantly evolving. It is able to process and understand large \"\n        \"amounts of text, and can use this knowledge to provide accurate and \"\n        \"informative responses to a wide range of questions. Additionally, \"\n        \"Assistant is able to generate its own text based on the input it \"\n        \"receives, allowing it to engage in discussions and provide \"\n        \"explanations and descriptions on a wide range of topics.\\n\\n\"\n        \"Overall, Assistant is a powerful tool that can help with a wide \"\n        \"range of tasks and provide valuable insights and information on a \"\n        \"wide range of topics. 
Whether you need help with a specific \"\n        \"question or just want to have a conversation about a particular \"\n        \"topic, Assistant is here to assist.\\n\\n{history}\\n\\n\"\n        \"Human: {human_input}\\n\"\n        \"Assistant:\"\n    ),\n    input_variables=[\"history\", \"human_input\"],\n)\n\n\nPERSON_CHATBOT_TEMPLATE = dict(\n    template=(\n        \"You are a human chatting with a chatbot. The chatbot is a large \"\n        \"language model trained by Meta and Nebuly-ai\\n\"\n        \"The chatbot is designed to be able to assist you with a wide range \"\n        \"of tasks, from answering simple questions to providing in-depth \"\n        \"explanations and discussions on a wide range of topics. You are a \"\n        \"human and you are testing the chatbot. Ask the chatbot questions and\"\n        \"see how it responds. You can also ask the chatbot to tell you a \"\n        \"story.\"\n        \"\\n\\n{history}\\n\\n\"\n        \"Chatbot: {chatbot_input}\\n\"\n        \"Human:\"\n    ),\n    input_variables=[\"history\", \"chatbot_input\"],\n)\n"
  },
  {
    "path": "optimization/chatllama/chatllama/llama_model.py",
    "content": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n# This software may be used and distributed according to the terms\n# of the GNU General Public License version 3.\n\nimport json\nimport math\nimport os\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Tuple, List, Union, Optional\n\nimport deepspeed\nimport torch\nimport torch.distributed\nimport torch.nn.functional as F\nimport fairscale.nn.model_parallel.initialize as fs_init\nfrom fairscale.nn.model_parallel.initialize import initialize_model_parallel\nfrom fairscale.nn.model_parallel.layers import (\n    ParallelEmbedding,\n    RowParallelLinear,\n    ColumnParallelLinear,\n)\nfrom torch import nn\nfrom transformers import AutoTokenizer\n\nfrom llama import Tokenizer\nfrom llama.generation import sample_top_p\n\n\nclass MyTokenizer:\n    \"\"\"Masked tokenizer of hugging face to be similar to the one of meta,\n    just used for testing purposes.\n    \"\"\"\n\n    def __init__(self, model_path: Optional[str] = None):\n\n        if model_path is None:\n            self.sp_model = AutoTokenizer.from_pretrained(\"gpt2\")\n        else:\n            self.sp_model = AutoTokenizer.from_pretrained(model_path)\n\n        self.n_words = self.sp_model.vocab_size\n        self.bos_id = self.sp_model.bos_token_id\n        self.eos_id = self.sp_model.eos_token_id\n        self.pad_id = self.sp_model.eos_token_id\n\n    def encode(\n        self,\n        s: str,\n        bos: bool = True,\n        eos: bool = True,\n        truncation: bool = True,\n    ) -> List[int]:\n        output = self.sp_model.encode(s, truncation=truncation)\n        t = list(output)\n        if bos:\n            t = [self.bos_id] + t\n        if eos:\n            t = t + [self.eos_id]\n        return t\n\n    def decode(self, t: List[int]) -> str:\n        input = torch.as_tensor(t)\n        output = self.sp_model.decode(input)\n        return output\n\n\nclass HFLikeTokenizer:\n    def 
__init__(self, tokenizer: Tokenizer):\n        self.tokenizer = tokenizer\n\n        # assign attributes from real tokenizer to masked one\n        self.pad_id = self.tokenizer.pad_id\n        self.eos_id = self.tokenizer.eos_id\n        self.bos_id = self.tokenizer.bos_id\n\n        # mask attribute to be similar to hugging face\n        self.eos_token_id = self.tokenizer.eos_id\n        self.pad_token_id = self.tokenizer.pad_id\n\n        # to match hugging face attribute\n        self.pad_token_id = self.pad_id\n\n    def create_sequence_mask(self, tokens: torch.Tensor) -> torch.Tensor:\n        mask = torch.where(\n            tokens == self.tokenizer.pad_id,\n            torch.zeros_like(tokens),\n            torch.ones_like(tokens),\n        )\n        mask = torch.where(\n            tokens == self.tokenizer.bos_id, torch.zeros_like(tokens), mask\n        )\n        mask = torch.where(\n            tokens == self.tokenizer.eos_id, torch.zeros_like(tokens), mask\n        )\n        return mask\n\n    def __call__(self, texts: Union[List[str], str], *args, **kwargs):\n        if isinstance(texts, str):\n            text = self.tokenizer.encode(texts, bos=True, eos=True)\n            tokens = torch.tensor(text).long()\n            mask = torch.ones_like(tokens)\n        else:\n            texts = [\n                self.tokenizer.encode(text, bos=True, eos=True)\n                for text in texts\n            ]\n            max_len = max(len(text) for text in texts)\n            tokens = torch.full(\n                (len(texts), max_len), self.tokenizer.pad_id\n            ).long()\n            for i, text in enumerate(texts):\n                tokens[i, -len(text) :] = torch.tensor(  # noqa E203\n                    text\n                ).long()\n\n            # TODO: decide how eos and bos should be handled - i need to mask\n            # them? 
or not?\n            mask = self.create_sequence_mask(tokens)\n            for i in range(tokens.shape[0]):\n                current_tokens = tokens[i, mask[i] == 1]\n                tokens[\n                    i, -len(current_tokens) - 1 : -1  # noqa E203\n                ] = current_tokens\n            mask = self.create_sequence_mask(tokens)\n\n            # convert `pad_id` from -1 to 0, otherwise embedding will cause out\n            # of bounds.\n            tokens = torch.where(\n                tokens == self.tokenizer.pad_id,\n                torch.zeros_like(tokens),\n                tokens,\n            )\n        output = {\n            \"input_ids\": tokens,\n            \"attention_mask\": mask,\n        }\n        return output\n\n    def decode(self, tokens):\n        return self.tokenizer.decode(tokens)\n\n\n@dataclass\nclass ModelArgs:\n    \"\"\"This class is a modification of the ModelArgs class implemented in\n    the LLaMA repo. The class has been modified for training, since the\n    original one just supports inference.\n    \"\"\"\n\n    dim: int = 512\n    n_layers: int = 8\n    n_heads: int = 8\n    # defined later by tokenizer\n    vocab_size: int = -1\n    # make SwiGLU hidden layer size multiple of large power of 2\n    multiple_of: int = 256\n    norm_eps: float = 1e-5\n\n    max_batch_size: int = 32\n    max_seq_len: int = 1024\n\n    # added attributes\n    froze_embeddings: bool = True\n    use_fairscale: bool = True\n\n\nclass RMSNorm(torch.nn.Module):\n    \"\"\"This class is a modification of the RMSNorm class implemented in\n    the LLaMA repo. 
The class has been modified for training, since the\n    original one just supports inference.\n    \"\"\"\n\n    def __init__(self, dim: int, eps: float = 1e-6):\n        super().__init__()\n        self.eps = eps\n        self.weight = nn.Parameter(torch.ones(dim))\n\n    def _norm(self, x):\n        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)\n\n    def forward(self, x):\n        output = self._norm(x.float()).type_as(x)\n        return output * self.weight\n\n\ndef precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):\n    freqs = 1.0 / (\n        theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)\n    )\n    t = torch.arange(end, device=freqs.device)  # type: ignore\n    freqs = torch.outer(t, freqs).float()  # type: ignore\n    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64\n    return freqs_cis\n\n\ndef reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):\n    ndim = x.ndim\n    assert 0 <= 1 < ndim\n    assert freqs_cis.shape == (x.shape[1], x.shape[-1])\n    shape = [\n        d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)\n    ]\n    return freqs_cis.view(*shape)\n\n\ndef apply_rotary_emb(\n    xq: torch.Tensor,\n    xk: torch.Tensor,\n    freqs_cis: torch.Tensor,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))\n    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))\n    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)\n    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)\n    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)\n    return xq_out.type_as(xq), xk_out.type_as(xk)\n\n\nclass Attention(nn.Module):\n    \"\"\"This class is a modification of the Attention class implemented in\n    the LLaMA repo. 
The class has been modified for training, since the\n    original one just supports inference.\n    \"\"\"\n\n    def __init__(self, args: ModelArgs):\n        super().__init__()\n\n        if args.use_fairscale:\n            self.n_local_heads = (\n                args.n_heads // fs_init.get_model_parallel_world_size()\n            )\n        else:\n            self.n_local_heads = args.n_heads\n        self.head_dim = args.dim // args.n_heads\n\n        if args.use_fairscale:\n            self.wq = ColumnParallelLinear(\n                args.dim,\n                args.n_heads * self.head_dim,\n                bias=False,\n                gather_output=False,\n                init_method=lambda x: x,\n            )\n            self.wk = ColumnParallelLinear(\n                args.dim,\n                args.n_heads * self.head_dim,\n                bias=False,\n                gather_output=False,\n                init_method=lambda x: x,\n            )\n            self.wv = ColumnParallelLinear(\n                args.dim,\n                args.n_heads * self.head_dim,\n                bias=False,\n                gather_output=False,\n                init_method=lambda x: x,\n            )\n            self.wo = RowParallelLinear(\n                args.n_heads * self.head_dim,\n                args.dim,\n                bias=False,\n                input_is_parallel=True,\n                init_method=lambda x: x,\n            )\n        else:\n            self.wq = nn.Linear(\n                args.dim, args.n_heads * self.head_dim, bias=False\n            )\n            self.wk = nn.Linear(\n                args.dim, args.n_heads * self.head_dim, bias=False\n            )\n            self.wv = nn.Linear(\n                args.dim, args.n_heads * self.head_dim, bias=False\n            )\n            self.wo = nn.Linear(\n                args.n_heads * self.head_dim, args.dim, bias=False\n            )\n\n        self.dim_cache = (\n            
args.max_batch_size,\n            args.max_seq_len,\n            self.n_local_heads,\n            self.head_dim,\n        )\n        self.cache_k = torch.zeros(self.dim_cache).cuda()\n\n        self.cache_v = torch.zeros(self.dim_cache).cuda()\n\n    def forward(\n        self,\n        x: torch.Tensor,\n        kv_mask: torch.Tensor,\n        freqs_cis: torch.Tensor,\n        cache_k: Optional[torch.Tensor] = None,\n        cache_v: Optional[torch.Tensor] = None,\n    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:\n        start_pos = 0  # Temporary\n\n        bsz, seqlen, _ = x.shape\n        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)\n\n        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)\n        xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim)\n        xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim)\n\n        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)\n\n        # Modified code to allow training, caching is not good for training\n        if (cache_k is None and cache_v is not None) or (\n            cache_k is not None and cache_v is None\n        ):\n            raise ValueError(\"cache_k is None while cache_v is not None\")\n        if cache_k is None:\n            keys = xk\n            values = xv\n        else:\n            cache_k.to(xk.device)\n            cache_v.to(xv.device)\n            cache_k[:bsz, start_pos : start_pos + seqlen] = xk  # noqa E203\n            cache_v[:bsz, start_pos : start_pos + seqlen] = xv  # noqa E203\n            keys = self.cache_k[:bsz, : start_pos + seqlen]  # noqa E203\n            values = self.cache_v[:bsz, : start_pos + seqlen]  # noqa E203\n\n        xq = xq.transpose(1, 2)\n        keys = keys.transpose(1, 2)\n        values = values.transpose(1, 2)\n        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(\n            self.head_dim\n        )\n        if kv_mask is not None:\n            scores = scores + kv_mask\n        scores = 
F.softmax(scores.float(), dim=-1).type_as(xq)\n        output = torch.matmul(scores, values)\n        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)\n        if cache_k is None:\n            return self.wo(output), None, None\n        else:\n            return self.wo(output), self.cache_k, self.cache_v\n\n\nclass FeedForward(nn.Module):\n    \"\"\"This class is a modification of the FeedForward class implemented in\n    the LLaMA repo. The class has been modified for training, since the\n    original one just supports inference.\n    \"\"\"\n\n    def __init__(\n        self, dim: int, hidden_dim: int, multiple_of: int, use_fairscale: bool\n    ):\n        super().__init__()\n        hidden_dim = int(2 * hidden_dim / 3)\n        hidden_dim = multiple_of * (\n            (hidden_dim + multiple_of - 1) // multiple_of\n        )\n\n        if use_fairscale:\n            self.w1 = ColumnParallelLinear(\n                dim,\n                hidden_dim,\n                bias=False,\n                gather_output=False,\n                init_method=lambda x: x,\n            )\n            self.w2 = RowParallelLinear(\n                hidden_dim,\n                dim,\n                bias=False,\n                input_is_parallel=True,\n                init_method=lambda x: x,\n            )\n            self.w3 = ColumnParallelLinear(\n                dim,\n                hidden_dim,\n                bias=False,\n                gather_output=False,\n                init_method=lambda x: x,\n            )\n        else:\n            self.w1 = nn.Linear(dim, hidden_dim, bias=False)\n            self.w2 = nn.Linear(hidden_dim, dim, bias=False)\n            self.w3 = nn.Linear(dim, hidden_dim, bias=False)\n\n    def forward(self, x):\n        return self.w2(F.silu(self.w1(x)) * self.w3(x))\n\n\nclass TransformerBlock(nn.Module):\n    \"\"\"This class is a modification of the TransformerBlock class\n    implemented in the LLaMA repo. 
The class has been modified for training,\n    since the original one just supports inference.\n    \"\"\"\n\n    def __init__(self, layer_id: int, args: ModelArgs):\n        super().__init__()\n        self.n_heads = args.n_heads\n        self.dim = args.dim\n        self.head_dim = args.dim // args.n_heads\n        self.attention = Attention(args)\n        self.feed_forward = FeedForward(\n            dim=args.dim,\n            hidden_dim=4 * args.dim,\n            multiple_of=args.multiple_of,\n            use_fairscale=args.use_fairscale,\n        )\n        self.layer_id = layer_id\n        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)\n        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)\n        self.use_fairscale = args.use_fairscale\n\n    def forward(\n        self,\n        x: torch.Tensor,\n        attention_mask: torch.Tensor,\n        freqs_cis: torch.Tensor,\n        cache_k: Optional[torch.Tensor] = None,\n        cache_v: Optional[torch.Tensor] = None,\n    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:\n        # modified from orignal code to enable external cache\n        attention_mask = attention_mask[:, None, :, :]\n        if self.use_fairscale:\n            attention_mask = attention_mask.expand(\n                -1,\n                self.n_heads // fs_init.get_model_parallel_world_size(),\n                -1,\n                -1,\n            )\n        else:\n            attention_mask = attention_mask.expand(-1, self.n_heads, -1, -1)\n        attn, cache_k, cache_v = self.attention.forward(\n            self.attention_norm(x), attention_mask, freqs_cis, cache_k, cache_v\n        )\n        h = x + attn\n        out = h + self.feed_forward.forward(self.ffn_norm(h))\n        return out, cache_k, cache_v\n\n\nclass Transformer(nn.Module):\n    \"\"\"This class is a modification of the Transformer class implemented in\n    the LLaMA repo. 
The class has been modified for training, since the\n    original one just supports inference. The generate method was inspired by\n    the generate function you can find in `llama.generation`.\n    \"\"\"\n\n    def __init__(self, params: ModelArgs):\n        super().__init__()\n\n        self.params = params\n        self.vocab_size = params.vocab_size\n        self.n_layers = params.n_layers\n        if params.use_fairscale:\n            self.n_local_heads = (\n                params.n_heads // fs_init.get_model_parallel_world_size()\n            )\n        else:\n            self.n_local_heads = params.n_heads\n\n        self.head_dim = params.dim // params.n_heads\n        dim = (\n            params.max_batch_size,\n            params.max_seq_len,\n            self.n_local_heads,\n            self.head_dim,\n        )\n        self.cache_k = [torch.zeros(dim) for _ in range(self.n_layers)]\n        self.cache_v = [torch.zeros(dim) for _ in range(self.n_layers)]\n\n        if params.use_fairscale:\n            self.tok_embeddings = ParallelEmbedding(\n                params.vocab_size, params.dim, init_method=lambda x: x\n            )\n        else:\n            self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)\n\n        if params.froze_embeddings:\n            for param in self.tok_embeddings.parameters():\n                param.requires_grad = False\n\n        self.layers = torch.nn.ModuleList()\n        for layer_id in range(params.n_layers):\n            self.layers.append(TransformerBlock(layer_id, params))\n\n        self.norm = RMSNorm(params.dim, eps=params.norm_eps)\n        if params.use_fairscale:\n            self.output = ColumnParallelLinear(\n                params.dim,\n                params.vocab_size,\n                bias=False,\n                init_method=lambda x: x,\n            )\n        else:\n            self.output = nn.Linear(params.dim, params.vocab_size, bias=False)\n\n        # TODO: How too modify this for 
training?\n        self.freqs_cis = precompute_freqs_cis(\n            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2\n        )\n\n    def forward(\n        self, tokens: torch.Tensor, attention_mask: torch.Tensor\n    ) -> torch.Tensor:\n        attention_mask = attention_mask.detach()\n        logits = self._forward(tokens, attention_mask)\n        return logits\n\n    def _forward(\n        self, tokens: torch.Tensor, attention_mask: torch.Tensor\n    ) -> torch.Tensor:\n        _bsz, seqlen = tokens.shape\n        h = self.tok_embeddings(tokens)\n        self.freqs_cis = self.freqs_cis.to(h.device)\n        # TEMPORARY FIX, need to understand how to manage the positioning\n        # embedding and the batch size with the current padding and masking.\n        start_pos = 1\n        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]  # noqa E203\n        # mask has size (bsz, seqlen). It should be transformed in\n        # (bsz, seqlen, seqlen)\n        # if the mask is a boolean tensor, convert it to int\n        if attention_mask.dtype == torch.bool:\n            attention_mask = attention_mask.long()\n        kv_mask = attention_mask[:, None, :].expand(_bsz, seqlen, seqlen)\n        kv_mask = torch.tril(kv_mask, diagonal=0)\n        kv_mask = 1 - kv_mask\n        kv_mask = (\n            torch.where(\n                kv_mask == 1, kv_mask.new_tensor(-9223372036854775808), kv_mask\n            )\n            .detach()\n            .long()\n        )\n\n        for i, layer in enumerate(self.layers):\n            if not self.training:\n                cache_k = self.cache_k[i]\n                cache_v = self.cache_v[i]\n                h, cache_k, cache_v = layer(\n                    h, kv_mask, freqs_cis, cache_k, cache_v\n                )\n            else:\n                h, _, _ = layer(h, kv_mask, freqs_cis)\n            if not self.training:\n                self.cache_k[i] = cache_k.detach()\n                self.cache_v[i] 
= cache_v.detach()\n\n        h = self.norm(h)\n        output = self.output(h)\n        return output\n\n    @torch.no_grad()\n    def generate(\n        self,\n        input_ids: torch.Tensor,\n        attention_mask: torch.Tensor,\n        max_new_tokens: int,\n        temperature: float,\n        top_p: float = 1.0,\n        no_repeat_ngram_size=None,\n    ):\n        generated_tokens = []\n        for cur_pos in range(max_new_tokens):\n            logits = self._forward(input_ids, attention_mask)[:, -1, :]\n            if temperature > 0:\n                probs = torch.softmax(logits / temperature, dim=-1)\n                next_token = sample_top_p(probs, top_p)\n            else:\n                next_token = torch.argmax(logits, dim=-1)\n            next_token = next_token.reshape(-1)\n            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)\n            attention_mask = torch.cat(\n                [attention_mask, torch.ones_like(next_token).unsqueeze(1)],\n                dim=1,\n            )\n            generated_tokens.append(next_token)\n        sequences = torch.concat(\n            (input_ids, torch.stack(generated_tokens, dim=1)), dim=1\n        )\n        return sequences\n\n\ndef setup_model_parallel() -> Tuple[int, int]:\n    local_rank = int(os.environ.get(\"LOCAL_RANK\", -1))\n    world_size = int(os.environ.get(\"WORLD_SIZE\", -1))\n    print(\"local_rank:\", local_rank, \"world_size:\", world_size)\n\n    torch.distributed.init_process_group(\"nccl\")\n    initialize_model_parallel(world_size)\n    torch.cuda.set_device(local_rank)\n\n    # seed must be the same in all processes\n    torch.manual_seed(1)\n    return local_rank, world_size\n\n\ndef setup_model_deepspeed() -> Tuple[int, int]:\n    local_rank = int(os.environ.get(\"LOCAL_RANK\", -1))\n    world_size = int(os.environ.get(\"WORLD_SIZE\", -1))\n\n    deepspeed.init_distributed()\n    torch.cuda.set_device(local_rank)\n\n    # seed must be the same in all 
processes\n    torch.manual_seed(1)\n    return local_rank, world_size\n\n\ndef load_checkpoints(\n    ckpt_dir: str, local_rank: int, world_size: int\n) -> Tuple[dict, dict]:\n    checkpoints = sorted(Path(ckpt_dir).glob(\"*.pth\"))\n    assert world_size == len(checkpoints), (\n        f\"Loading a checkpoint for MP={len(checkpoints)} but world \"\n        f\"size is {world_size}\"\n    )\n    ckpt_path = checkpoints[local_rank]\n    print(\"Loading\")\n    checkpoint = torch.load(ckpt_path, map_location=\"cpu\")\n    with open(Path(ckpt_dir) / \"params.json\", \"r\") as f:\n        params = json.loads(f.read())\n    return checkpoint, params\n\n\ndef load_model(\n    ckpt_dir: str,\n    tokenizer_path: str,\n    local_rank: int,\n    world_size: int,\n    froze_embeddings: bool,\n    use_fairscale: bool,\n    max_batch_size: int = 32,\n) -> Tuple[Transformer, HFLikeTokenizer]:\n\n    checkpoint, params = load_checkpoints(ckpt_dir, local_rank, world_size)\n    model_args: ModelArgs = ModelArgs(\n        max_seq_len=1024, max_batch_size=max_batch_size, **params\n    )\n    model_args.froze_embeddings = froze_embeddings\n    model_args.use_fairscale = use_fairscale\n    tokenizer = Tokenizer(model_path=tokenizer_path)\n    model_args.vocab_size = tokenizer.n_words\n    torch.set_default_tensor_type(torch.cuda.HalfTensor)\n    model = Transformer(model_args)\n    torch.set_default_tensor_type(torch.FloatTensor)\n    model.load_state_dict(checkpoint, strict=False)\n    tokenizer = HFLikeTokenizer(tokenizer)\n\n    return model, tokenizer\n\n\ndef load_tokenizer(tokenizer_path: str):\n    tokenizer = Tokenizer(model_path=tokenizer_path)\n    return tokenizer\n\n\ndef load_tokenizer_test(tokenizer_path: Optional[str] = None):\n    tokenizer = MyTokenizer(model_path=tokenizer_path)\n    return tokenizer\n\n\ndef load_model_test(\n    ckpt_dir: str,\n    tokenizer_path: str,\n    local_rank: int,\n    world_size: int,\n    froze_embeddings: bool,\n    use_fairscale: 
bool,\n    max_batch_size: int = 32,\n) -> Tuple[Transformer, HFLikeTokenizer]:\n\n    # test the model with hf tokenizer\n    model_args = ModelArgs()\n    model_args.froze_embeddings = froze_embeddings\n    model_args.use_fairscale = use_fairscale\n    tokenizer = MyTokenizer(model_path=tokenizer_path)\n    model_args.vocab_size = tokenizer.n_words\n    model = Transformer(model_args).cuda()\n    tokenizer = HFLikeTokenizer(tokenizer)\n\n    return model, tokenizer\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/__init__.py",
    "content": "\"\"\"RLHF implementation inspired by Lucidrains' implementation.\"\"\"\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/actor.py",
    "content": "import json\nimport yaml\nimport os\nimport shutil\n\nimport deepspeed\nimport torch\nfrom accelerate import Accelerator\nfrom beartype import beartype\nfrom beartype.typing import Tuple\nfrom einops import rearrange\nfrom peft import get_peft_model, LoraConfig, TaskType\nfrom torch.utils.data import DataLoader, Dataset\nfrom transformers import (\n    AutoModelForCausalLM,\n    AutoTokenizer,\n)\n\nfrom chatllama.rlhf.config import ConfigActor\nfrom chatllama.rlhf.model_list import (\n    hf_models_causal_lm,\n    llama_models,\n    hf_models,\n)\n\nfrom chatllama.rlhf.model_loader import ModelLoader\nfrom chatllama.rlhf.utils import TrainingStats\n\n\nclass ActorModel(torch.nn.Module):\n    \"\"\"Actor model that generates the augmented prompt from the initial\n    user_input. The aim is to train this model to generate better prompts.\n\n    Attributes:\n        model: The model from LLaMA to be used\n        tokenizer: The LLaMA tokenizer\n        config (ConfigActor): Configuration for the actor model\n\n    Methods:\n        load: Load the model from a path\n        save: Save the model to a path\n        forward: Compute the action logits for a given sequence.\n        generate: Generate a sequence from a given prompt\n    \"\"\"\n\n    def __init__(self, config: ConfigActor) -> None:\n        super().__init__()\n\n        # save config\n        self.config = config\n\n        # initialize the self.model\n        if config.model in llama_models:\n            # llama module might not be present when HF models are used\n            from chatllama.llama_model import (\n                load_model,\n                setup_model_parallel,\n            )  # noqa\n\n            local_rank, world_size = setup_model_parallel()\n\n            # use load_model_test for testing\n            self.model, self.tokenizer = load_model(\n                ckpt_dir=config.model_folder,\n                tokenizer_path=config.tokenizer_path,\n                
local_rank=local_rank,\n                world_size=world_size,\n                froze_embeddings=config.froze_embeddings,\n                use_fairscale=config.use_fairscale,\n                max_batch_size=config.batch_size,\n            )\n        elif config.model in hf_models_causal_lm:\n            self.tokenizer = self.load_tokenizer(config)\n            self.model = AutoModelForCausalLM.from_pretrained(\n                config.model,\n            )\n\n            # Setup PEFT model\n            if config.peft_enable:\n\n                # check that the peft config exist\n                if os.path.exists(config.peft_config_path):\n                    # Read the peft config from yaml\n                    with open(config.peft_config_path, \"r\") as c:\n                        config_peft = yaml.safe_load(c)\n                else:\n                    raise ValueError(\n                        f\"PEFT config {config.peft_config_path} not found\"\n                    )\n\n                print(config_peft)\n                # define lora config for peft\n                peft_config = LoraConfig(\n                    task_type=TaskType.CAUSAL_LM, **config_peft\n                )\n\n                # create peft model\n                self.model = get_peft_model(\n                    model=self.model,\n                    peft_config=peft_config,\n                )\n\n            self.model.to(config.device)\n\n        else:\n            raise ValueError(f\"Model {config.model} not supported\")\n\n        # load the model from model_folder\n        self.load()\n\n    @beartype\n    def load(self) -> None:\n        \"\"\"Load the model from the path\"\"\"\n        # check if there is a model to load\n        path = ModelLoader.check_model_path(\n            config=self.config,\n            is_checkpoint=False,\n            current_epoch=None,\n        )\n\n        # if there is a model to load\n        if path is not None:\n\n            # load the model\n          
  print(\"Loading ...\")\n            model_dict = torch.load(path)\n            self.model.load_state_dict(model_dict.get(\"state_dict\") or model_dict.get(\"model\"))\n\n    @beartype\n    def save(self) -> None:\n        \"\"\"Save the model to the path\"\"\"\n        # get the path to save the model\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config,\n            is_checkpoint=False,\n            current_epoch=None,\n        )\n\n        # save the model\n        print(f\"Saving model to {path} ...\")\n        torch.save(\n            {\"state_dict\": self.model.state_dict()},\n            path,\n        )\n\n    @staticmethod\n    def load_tokenizer(config: ConfigActor):\n        \"\"\"Load the tokenizer from the model name\"\"\"\n        if config.model in hf_models:\n            # load the tokenizer from HF\n            tokenizer = AutoTokenizer.from_pretrained(\n                config.model,\n                padding_side=\"left\",\n                padding=True,\n                truncation=True,\n                model_max_length=config.max_sequence_length,\n            )\n\n            # add eos token if not present\n            if tokenizer.eos_token is None:\n                tokenizer.eos_token = \"</s>\"\n                tokenizer.eos_token_id = 2  # OPT eos-token-id\n\n            # add pad token if not present\n            if tokenizer.pad_token is None:\n                tokenizer.pad_token = tokenizer.eos_token\n                tokenizer.pad_token_id = tokenizer.eos_token_id\n        elif config.model in llama_models:\n\n            # llama module might not be present when HF models are used\n            from chatllama.llama_model import (\n                load_tokenizer,\n            )  # noqa\n\n            tokenizer = load_tokenizer(config.tokenizer_path)\n        return tokenizer\n\n    def parameters(self):\n        \"\"\"Return the parameters of the model\"\"\"\n        return 
self.model.parameters()\n\n    @beartype\n    def forward(\n        self, sequences: torch.Tensor, sequences_mask: torch.Tensor\n    ) -> torch.Tensor:\n        \"\"\"Generate logits to have probability distribution over the vocabulary\n            of the actions\n\n        Args:\n            sequences (torch.Tensor): Sequences of states and actions used to\n                    compute token logits for the whole list of sequences\n            attention_mask (torch.Tensor): Mask for the sequences attention\n\n        Returns:\n            logits (torch.Tensor): Logits for the actions taken\n        \"\"\"\n        model_output = self.model.forward(\n            sequences, attention_mask=sequences_mask\n        )\n        # need to return logits for the actions\n        if self.config.model in hf_models_causal_lm:\n            model_output = model_output.logits\n        if self.config.debug:\n            print(\"ActorModel.forward\")\n            print(\"model_output_logits shape\", model_output.shape)\n            print(\"model_output logits\", model_output)\n        return model_output\n\n    @beartype\n    @torch.no_grad()\n    def generate(\n        self, states: torch.Tensor, state_mask: torch.Tensor\n    ) -> Tuple:\n        \"\"\"Generate actions and sequences=[states, actions] from state\n            (i.e. input of the prompt generator model)\n\n        Args:\n            state (torch.Tensor): the input of the user\n            state_mask (torch.Tensor): Mask for the state input (for padding)\n\n        Returns:\n            actions (torch.Tensor): Actions generated from the state\n            sequences (torch.Tensor): Sequences generated from the\n                state as [states, actions]\n        \"\"\"\n        # temperature for the actor\n        temperature = self.config.temperature\n\n        # max sequence length for the actor (i.e. 
prompt + completion)\n        max_sequence_length = self.config.max_sequence_length\n\n        # max and min number of tokens to generate\n        max_tokens = self.config.max_tokens\n        min_tokens = self.config.min_tokens\n\n        # max generation possible given the state and the max sequence length\n        max_generation_possible = max_sequence_length - states.shape[1]\n        if max_generation_possible < min_tokens:\n            raise ValueError(\n                f\"The prompt is too long w.r.t the \"\n                f\"model sequence length \\n\"\n                f\"max_sequence_length={max_sequence_length}\\n\"\n                f\"state_length={states.shape[1]}\\n\"\n                f\"min_tokens={min_tokens}\\n\"\n                f\"max_tokens={max_tokens}\\n\"\n                f\"max_generation_possible={max_generation_possible}\\n\"\n            )\n\n        # take the minimum the max_tokens and the max_generation_possible\n        max_completion = min(max_tokens, max_generation_possible)\n\n        sequences = self.model.generate(\n            input_ids=states,\n            attention_mask=state_mask,\n            temperature=temperature,\n            max_new_tokens=max_completion,\n            no_repeat_ngram_size=3,\n        )\n        actions = sequences[:, states.shape[1] :]  # noqa E203\n        if self.config.debug:\n            print(\n                f\"input length {states.shape[1]} \\n\"\n                f\"max sequence length {max_sequence_length} \\n\"\n                f\"max completion {max_completion} \\n\"\n                f\"generated sequence {sequences.shape[1]} \\n\"\n            )\n            print(\"ActorModel.generate\")\n            print(\"state\", states)\n            print(\"state shape\", states.shape)\n            print(\"sequence shape\", sequences.shape)\n            print(\"sequence\", sequences)\n            print(\"actions shape\", actions.shape)\n            print(\"actions\", actions)\n        return actions, 
sequences\n\n\nclass ActorDataset(Dataset):\n    \"\"\"Dataset for the pretraining of the actor model\n    read a json file with the following format:\n    [\n        {\n            \"user_input\": \"...\"\n            \"completion\": \"...\"\n        },\n        ...\n    ]\n    Where:\n        user_input: the input of the user\n        completion: the output of the user\n    \"\"\"\n\n    def __init__(\n        self,\n        path: str,\n    ) -> None:\n        self.path = path\n        with open(path, \"r\") as f:\n            data = json.load(f)\n        self.data = [d[\"user_input\"] + d[\"completion\"] for d in data]\n\n    def __getitem__(self, idx):\n        return self.data[idx]\n\n    def __len__(\n        self,\n    ):\n        return len(self.data)\n\n\nclass ActorTrainer:\n    \"\"\"Used to pre-train the actor model to generate better prompts.\n\n    Args:\n        config (ConfigActor): Configuration for the actor model\n\n    Attributes:\n        config (ConfigActor): Configuration for the actor model\n        model (ActorModel): Actor model\n        loss_function (torch.nn.CrossEntropyLoss): Loss function\n        optimizer (torch.optim.Adam): Optimizer\n        validation_flag (bool): Flag to indicate if the validation dataset\n            is provided\n        train_dataset (ActorDataset): Training dataset\n        train_dataloader (DataLoader): Training dataloader\n        validation_dataset (ActorDataset): Validation dataset\n        validation_dataloader (DataLoader): Validation dataloader\n        scheduler (torch.optim.lr_scheduler): Learning rate scheduler\n        training_stats (TrainingStats): Training statistics\n        model_engine (ModelEngine): Model engine for deepspeed training\n        accelerator (Accelerator): Accelerator for accelerate training\n\n    Methods:\n        train: Train the actor model\n        load_checkpoint: Load a checkpoint\n        save_checkpoint: Save a checkpoint\n    \"\"\"\n\n    def __init__(self, config: 
ConfigActor) -> None:\n\n        # store config\n        self.config = config\n\n        # load the model\n        self.actor = ActorModel(config)\n\n        # define loss function\n        self.loss_function = torch.nn.CrossEntropyLoss()\n\n        # define optimizer\n        self.optimizer = torch.optim.AdamW(\n            self.actor.parameters(), lr=config.lr, weight_decay=1e-5\n        )\n\n        # check if validation dataset is provided\n        self.validation_flag = False\n        if config.validation_dataset_path is not None:\n            self.validation_flag = True\n\n        # create dataset and dataloaders\n        self.train_dataset = ActorDataset(config.train_dataset_path)\n        self.train_dataloader = DataLoader(\n            self.train_dataset, batch_size=config.batch_size\n        )\n        if self.validation_flag:\n            self.eval_dataset = ActorDataset(config.validation_dataset_path)\n            self.validation_dataloader = DataLoader(\n                self.eval_dataset, batch_size=config.batch_size\n            )\n\n        # define scheduler for the learning rate\n        # learning rate is decreased until 10% of the initial value\n        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(\n            self.optimizer,\n            T_0=len(self.train_dataset) // config.batch_size,\n            T_mult=1,\n            eta_min=config.lr * 0.1,\n        )\n\n        # define training statistics\n        stat_path = ModelLoader.get_training_stats_path(config)\n        self.training_stats = TrainingStats(stat_path)\n\n        # consistency check between accelerate and deepspeed\n        if config.accelerate_enable and config.deepspeed_enable:\n            raise ValueError(\n                \"Both DeepSpeed and Accelerate are enabled for the Actor.\"\n                \"Please choose one of them.\"\n            )\n\n        # initialize deepspeed\n        self.model_engine = None\n        if config.deepspeed_enable is 
True:\n            if config.deepspeed_config_path is None:\n                raise ValueError(\n                    \"DeepSpeed config path is None, but deepspeed is enabled\"\n                )\n            if os.path.exists(config.deepspeed_config_path) is False:\n                raise ValueError(\n                    f\"DeepSpeed config path {config.deepspeed_config_path}\"\n                    f\"does not exist\"\n                )\n            (\n                self.model_engine,\n                self.optimizer,\n                self.train_dataloader,\n                _,\n            ) = deepspeed.initialize(\n                args=None,\n                model=self.actor,\n                model_parameters=self.actor.parameters(),\n                training_data=self.train_dataset,\n                config=self.config.deepspeed_config_path,\n            )\n            print(\"Training with DeepSpeed\")\n\n        # initialize accelerate\n        self.accelerator = None\n        if config.accelerate_enable is True:\n            self.accelerator = Accelerator()\n            (\n                self.actor,\n                self.optimizer,\n                self.train_dataloader,\n                self.scheduler,\n            ) = self.accelerator.prepare(\n                self.actor,\n                self.optimizer,\n                self.train_dataloader,\n                self.scheduler,\n            )\n            print(\"Training with Accelerate\")\n\n    @beartype\n    def save_checkpoint(\n        self,\n        current_epoch: int,\n        current_step: int,\n        max_epochs: int,\n        max_steps: int,\n    ) -> None:\n        \"\"\"Save the current checkpoint\n\n        Args:\n            current_epoch (int): Current epoch\n            current_step (int): Current step\n            max_epochs (int): Maximum number of epochs\n            max_steps (int): Maximum number of steps\n        \"\"\"\n\n        print(\n            f\"Saving checkpoint for epoch 
{current_epoch + 1}, \"\n            f\"step {current_step + 1} ...\"\n        )\n        # look for path to save the checkpoint\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config,\n            is_checkpoint=True,\n            current_epoch=current_epoch,\n            current_step=current_step,\n            max_epochs=max_epochs,\n            max_steps=max_steps,\n        )\n\n        # remove the checkpoint if it already exists\n        if os.path.exists(path):\n            if self.config.deepspeed_enable:\n                shutil.rmtree(path)\n            else:\n                os.remove(path)\n\n        if self.config.deepspeed_enable:\n            client_state = {\n                \"epoch\": current_epoch,\n                \"step\": current_step,\n            }\n            self.model_engine.save_checkpoint(path, client_state=client_state)\n        else:\n            # save the checkpoint\n            torch.save(\n                {\n                    \"state_dict\": self.actor.model.state_dict(),\n                    \"optim_state_dict\": self.optimizer.state_dict(),\n                    \"training_stats\": self.training_stats,\n                    \"epoch\": current_epoch,\n                    \"step\": current_step,\n                },\n                path,\n            )\n\n        # remove old checkpoints\n        n_checkpoints_to_keep = self.config.n_checkpoints_to_keep\n        ModelLoader.delete_old_checkpoints(\n            model_folder, model_name, n_checkpoints_to_keep\n        )\n\n    @beartype\n    def load_checkpoint(\n        self,\n    ) -> Tuple[int, int]:\n        \"\"\"Load a checkpoint from the model folder\n\n        Returns:\n            Tuple[int, int]: Current epoch and current step to resume\n                training\n        \"\"\"\n\n        print(\"Looking for checkpoints...\")\n        # look for a checkpoint\n        path = ModelLoader.check_model_path(\n            
config=self.config,\n            is_checkpoint=True,\n            current_epoch=None,\n        )\n\n        # if there is a checkpoint\n        if path is not None:\n            print(\"Loading ...\")\n\n            if self.config.deepspeed_enable:\n                # try to load the checkpoint\n                try:\n                    _, client_state = self.model_engine.load_checkpoint(path)\n                except Exception:\n                    print(\n                        \"Checkpoint corrupted!\"\n                        \"Try to remove the last checkpoint.\"\n                        \"Now Starting from epoch 0, step 0\"\n                    )\n                    return 0, 0\n                # load epoch and step to resume loops\n                epoch = client_state[\"epoch\"]\n                step = client_state[\"step\"]\n            else:\n                # try to load the checkpoint\n                try:\n                    checkpoint = torch.load(path)\n                except Exception:\n                    print(\n                        \"Checkpoint corrupted!\"\n                        \"Try to remove the last checkpoint.\"\n                        \"Now Starting from epoch 0, step 0\"\n                    )\n                    return 0, 0\n\n                # assing the checkpoint to the model\n                epoch = checkpoint[\"epoch\"]\n                self.actor.model.load_state_dict(checkpoint[\"state_dict\"])\n                self.optimizer.load_state_dict(checkpoint[\"optim_state_dict\"])\n                self.trainign_stats = checkpoint[\"training_stats\"]\n                step = checkpoint[\"step\"]\n                return epoch, step + 1  # return the next episode to train\n        return 0, 0\n\n    def add_eos_token(\n        self, tokens: torch.Tensor, mask: torch.Tensor\n    ) -> Tuple[torch.Tensor, torch.Tensor]:\n        # given tokens and mask, add eos token to the end of each sequence\n        # and update the mask\n        
batch_size, seq_len = tokens.shape\n        eos_token = self.actor.tokenizer.eos_token_id\n\n        # see if i can append 1 token\n        n_tokens_to_append = min(self.config.max_sequence_length - seq_len, 1)\n        n_tokens_to_append = max(n_tokens_to_append, 0)\n\n        # concatenate eos to tokens and mask\n        if n_tokens_to_append > 0:\n            tokens = torch.cat(\n                [\n                    tokens,\n                    torch.ones(batch_size, n_tokens_to_append)\n                    .long()\n                    .to(tokens.device)\n                    * eos_token,\n                ],\n                dim=1,\n            )\n            mask = torch.cat(\n                [\n                    mask,\n                    torch.ones(batch_size, n_tokens_to_append)\n                    .long()\n                    .to(mask.device),\n                ],\n                dim=1,\n            )\n        return tokens, mask\n\n    def train(\n        self,\n    ) -> None:\n        \"\"\"Train the model\"\"\"\n        print(\"Start Actor Model Pretraining\")\n\n        # get config parameters\n        if self.config.deepspeed_enable:\n            batch_size = self.train_dataloader.batch_size\n        else:\n            batch_size = self.config.batch_size\n        epochs = self.config.epochs\n        device = self.config.device\n        checkpoint_steps = self.config.checkpoint_steps\n\n        # compute the number of iterations\n        n_iter = int(len(self.train_dataset) / batch_size)\n\n        # load model_checkpoint\n        start_epoch, start_step = self.load_checkpoint()\n\n        if start_epoch == 0 and start_step == 0:\n            self.training_stats.clear()\n\n        # counter for the checkpoint\n        cnt_checkpoint = 1\n\n        # traing loop\n        for epoch in range(start_epoch, epochs):\n            self.actor.train()\n            for i, input_text in enumerate(self.train_dataloader):\n\n                # skip the first steps 
if we are resuming training\n                if i < start_step:\n                    continue\n\n                # tokenize input\n                with torch.no_grad():\n                    input_tokenized = self.actor.tokenizer(\n                        input_text,\n                        return_tensors=\"pt\",\n                        truncation=True,\n                        padding=True,\n                    )\n\n                    # split tokens and mask\n                    input_tokenized_id = input_tokenized[\"input_ids\"]\n                    input_tokenized_mask = input_tokenized[\"attention_mask\"]\n\n                    # add eos token\n                    (\n                        input_tokenized_id,\n                        input_tokenized_mask,\n                    ) = self.add_eos_token(\n                        input_tokenized_id,\n                        input_tokenized_mask,\n                    )\n\n                    # split into input and output\n                    training_output = input_tokenized_id[:, 1:]\n                    training_input = input_tokenized_id[:, :-1]\n                    attention_mask = input_tokenized_mask[:, :-1]\n\n                    # move to device\n                    training_output = training_output.to(device)\n                    training_input = training_input.to(device)\n                    attention_mask = attention_mask.to(device)\n\n                # forward pass\n                if self.config.deepspeed_enable:\n                    est_output = self.model_engine(\n                        training_input, attention_mask\n                    )\n                else:\n                    est_output = self.actor(training_input, attention_mask)\n\n                # compute loss\n                est_output = rearrange(est_output, \"b s v -> (b s) v\")\n                training_output = rearrange(training_output, \"b s -> (b s)\")\n                loss = self.loss_function(est_output, training_output)\n      
          self.training_stats.training_loss.append(loss.item())\n\n                # backward pass\n                if self.config.deepspeed_enable:\n                    self.model_engine.backward(loss)\n                    self.model_engine.step()\n                elif self.config.accelerate_enable:\n                    self.optimizer.zero_grad()\n                    self.accelerator.backward(loss)\n                    self.optimizer.step()\n                    self.scheduler.step()\n                else:\n                    self.optimizer.zero_grad()\n                    loss.backward()\n                    self.optimizer.step()\n                    self.scheduler.step()\n\n                # print progress\n                if i % self.config.iteration_per_print == 0:\n                    print(\n                        f\"Epoch: {epoch+1}/{epochs}, \"\n                        f\"Iteration: {i+1}/{n_iter}, \"\n                        f\"Training Loss: {loss}\"\n                    )\n\n                # save checkpoint periodically\n                if cnt_checkpoint % checkpoint_steps == 0:\n                    self.save_checkpoint(epoch, i, epochs, n_iter)\n                    self.training_stats.save()\n                    cnt_checkpoint = 1\n                else:\n                    cnt_checkpoint += 1\n\n            # Validation\n            if self.validation_flag:\n                self.actor.eval()\n                with torch.no_grad():\n                    for i, input_text in enumerate(self.validation_dataloader):\n\n                        # tokenize input\n                        input_tokenized = self.actor.tokenizer(\n                            input_text, return_tensors=\"pt\", padding=True\n                        )\n                        validation_output = input_tokenized[\"input_ids\"][:, 1:]\n                        validation_input = input_tokenized[\"input_ids\"][:, :-1]\n                        attention_mask = 
input_tokenized[\"attention_mask\"][\n                            :, :-1\n                        ]\n\n                        # forward pass\n                        est_output = self.actor.forward(\n                            validation_input, attention_mask\n                        )\n                        validation_output = rearrange(\n                            validation_output, \"b s -> (b s)\"\n                        )\n\n                        # compute loss\n                        est_output = rearrange(est_output, \"b s v -> (b s) v\")\n                        loss = self.loss_function(\n                            est_output, validation_output\n                        )\n                        self.training_stats.validation_loss.append(loss.item())\n\n                        # print progress\n                        if i % self.config.iteration_per_print == 0:\n                            print(\n                                f\"Epoch: {epoch+1}/{epochs}, \"\n                                f\"Iteration: {i+1}/{n_iter}, \"\n                                f\"Validation Loss: {loss}\"\n                            )\n            # reset start_step after training is resumed\n            start_step = 0\n\n        # save the model\n        self.actor.save()\n        print(\"Training Finished \")\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/config.py",
    "content": "import yaml\nimport os\nfrom dataclasses import dataclass\n\nimport torch\nfrom beartype import beartype\nfrom beartype.typing import Optional\n\n\n@dataclass\nclass ConfigReward:\n    \"\"\"Config parameters for the reward model\n\n    Attributes:\n        device (torch.device): Device to be used for the reward model\n        model (str): Model to be used for the reward model\n        model_folder (str): Path to the folder where model are stored (used\n            to load / store finetuned model or checkpoints)\n        model_head_hidden_size (int): Hidden size of the reward model head\n        max_sequence_length (int): Max sequence length of the reward model\n        train_dataset_path (Optional[str]): Path to the training dataset.\n            Default to None. To be specified only for the reward model trainig.\n        validation_dataset_path (Optional[str]): Path to the validation\n            dataset. Default to None. To be specified only for the reward\n            model trainig.\n        batch_size (Optional[int]): Batch size to train the reward model.\n            Default to None. To be specified only for the reward model\n            trainig.\n        epochs (Optional[int]): Number of epochs to train the reward model.\n            Default to None. To be specified only for the reward model\n            trainig.\n        iteration_per_print (Optional[int]): Number of iterations to print\n            the training loss. Default to None. To be specified only for the\n            reward model trainig.\n        checkpoint_steps (Optional[int]): Number of steps (backProp) to\n            interleave checkpoints. Default to None. To be specified only for\n            the reward model trainig.\n        checkpoint_name (Optional[str]): Name of the checkpoint. Default to\n            None.\n        lr (Optional[float]): Learning rate for the reward model. Default to\n            None. 
To be specified only for the reward model distillation.\n        llm_enable (bool): Enable reward model distillation. Default to True.\n            Disable it if you dont have an API key.\n        llm_model (Optional[str]): Model to be used for the reward model\n            distillation. Default to \"text-davinci-003\".\n        llm_temperature (Optional[float]): Temperature for the reward model\n            distillation. Default to 0.9.\n        llm_max_tokens (Optional[int]): Max tokens for the reward model\n            distillation. Default to 64.\n        deepspeed_enable (bool): Enable deepspeed for the reward model\n            training. Default to False.\n        deepspeed_config_path (str): Path to the deepspeed config file.\n            Default to None.\n        is_reward (bool): True if the model is a reward model. Default to True.\n        accelerate_enable (bool): Enable accelerate for the reward model\n        debug (bool): enable prints for Debugging\n    \"\"\"\n\n    device: torch.device\n    model: str\n    model_folder: str\n    model_head_hidden_size: int\n    max_sequence_length: int\n    train_dataset_path: Optional[str] = None\n    validation_dataset_path: Optional[str] = None\n    batch_size: Optional[int] = None\n    epochs: Optional[int] = None\n    iteration_per_print: Optional[int] = None\n    checkpoint_steps: Optional[int] = None\n    checkpoint_name: Optional[str] = None\n    lr: Optional[float] = None\n    llm_enable: Optional[bool] = False\n    llm_model: Optional[str] = \"text-davinci-003\"\n    llm_temperature: Optional[float] = 0.9\n    llm_max_tokens: Optional[int] = 64\n    deepspeed_enable: bool = False\n    deepspeed_config_path: Optional[str] = None\n\n    # critic specific parameters\n    is_reward: bool = True\n    accelerate_enable: bool = False\n\n    debug: bool = False\n\n\n# just for naming consistency\nConfigCritic = ConfigReward\n\n\n@dataclass\nclass ConfigActor:\n    \"\"\"Config parameters for models\n\n    
Attributes:\n        model (str): Model to be used for the actor\n        model_folder (str): Path to the folder where model are stored (used\n            to load / store finetuned model or checkpoints)\n        tokenizer_path (str): Path to the folder where tokenizer are stored\n        train_dataset_path (str): Path to the training dataset\n        validation_dataset_path (Optional[str]): Path to the validation dataset\n        froze_embeddings (bool): Froze embeddings for the actor\n        use_fairscale (bool): Use fairscale module for the actor instead of\n            pytorch native modules.\n        max_sequence_length (int): Max sequence length for the actor\n        max_tokens (int): Max tokens for actor generation\n        min_tokens (int): Min tokens for actor generation\n        additonal_prompt_tokens (int): Number of tokens to be used as safety\n            to avoid too large sequences and to add a template to the\n            dataset\n        temperature (float): Temperature for the actor\n        batch_size (int): Batch size to train the actor\n        iteration_per_print (int): Number of iterations to print the\n            training loss\n        lr (float): Learning rate for the actor\n        epochs (int): Number of epochs to train the actor\n        checkpoint_steps (int): Number of steps (backProp) to interleave\n            checkpoints.\n        n_checkpoints_to_keep (int): Number of checkpoints to keep\n            for the actor.\n        deepspeed_enable (bool): Enable deepspeed for the actor.\n            Default to False.\n        deepspeed_config_path (str): Path to the deepspeed config file.\n            Default to None.\n        accelerate_enable (bool): Enable accelerate for the actor\n        device (torch.device): Device to be used for the actor\n        checkpoint_name (Optional[str]): Name of the checkpoint. 
Default to\n            None.\n        peft_enable (bool): Enable peft for the actor\n        peft_config_path (str): Path to the peft config file.\n        debug (bool): Enable prints for debugging\n\n    \"\"\"\n\n    model: str\n    model_folder: str\n    tokenizer_path: str\n    train_dataset_path: str\n    validation_dataset_path: Optional[str]\n    froze_embeddings: bool\n    use_fairscale: bool\n    max_sequence_length: int\n    max_tokens: int\n    min_tokens: int\n    additonal_prompt_tokens: int\n    temperature: float\n    batch_size: int\n    iteration_per_print: int\n    lr: float\n    epochs: int\n    checkpoint_steps: int\n    n_checkpoints_to_keep: int\n\n    deepspeed_enable: bool\n    deepspeed_config_path: Optional[str]\n\n    accelerate_enable: bool\n\n    device: torch.device\n    peft_enable: bool\n    peft_config_path: str\n    checkpoint_name: Optional[str] = None\n    debug: bool = False\n\n\n@dataclass\nclass ConfigTrainer:\n    \"\"\"Config parameters for the trainer, used to configure the reinforcement\n    learning training loop\n\n    Attributes:\n        actor_lr (float): Learning rate for the actor when training with\n            reinforcement learning\n        critic_lr (float): Learning rate for the critic when training with\n            reinforcement learning\n        actor_eps_clip (float): Epsilon clip for the actor\n        critic_eps_clip (float): Epsilon clip for the critic\n        beta_s (float): Beta for the actor and critic\n        gamma (float): coefficient for the discounted rewards.\n        examples_path (str): Path to the examples dataset\n        num_episodes (int): Number of episodes, each episodes consist of\n            a number of timesteps that are used to generate examples\n            stored in the memory buffer.\n        max_timesteps (int): Max timesteps for the actor and critic.\n            for each timestep a set of examples are sampled and used to\n            generate a completion and a reward.\n      
  update_timesteps (int): Number of timesteps to update the actor and\n            critic\n        num_examples (int): Number of examples to generate for the actor\n            and critic. For each iteration of timestep, num_examples are\n            sampled from the prompt dataset, processed and stored in the\n            memory buffer.\n        batch_size (int): Batch size to train the actor and critic.\n            This batch is used to aggregate the memory from the memory buffer\n            for the actual training of the actor and critic models.\n        epochs (int): Number of epochs to train the actor and critic.\n        checkpoint_steps (int): Number of episodes to interleave checkpoints.\n        device (torch.device): Device to be used for the actor and critic\n        checkpoint_name (Optional[str]): Name of the checkpoint. Default to\n            None.\n    \"\"\"\n\n    actor_lr: int\n    critic_lr: int\n    actor_eps_clip: float\n    critic_eps_clip: float\n    beta_s: float\n    gamma_discounted: float\n    examples_path: str\n    num_episodes: int\n    max_timesteps: int\n    update_timesteps: int\n    num_examples: int\n    batch_size: int\n    epochs: int\n    checkpoint_steps: int\n    device: torch.device\n    checkpoint_name: Optional[str] = None\n    debug: bool = False\n\n\nclass Config:\n    \"\"\"Store the config parameters for the whole pipeline\n\n    Args:\n        trainer_dict (Optional[Dict]): Dictionary with the config parameters\n            for the trainer. Default to None. If None, the config.yaml is\n            used.\n        actor_dict (Optional[Dict]): Dictionary with the config parameters\n            for the actor. Default to None. If None, the config.yaml is\n            used.\n        critic_dict (Optional[Dict]): Dictionary with the config parameters\n            for the critic. Default to None. 
If None, the config.yaml is\n            used.\n        reward_dict (Optional[Dict]): Dictionary with the config parameters\n            for the reward. Default to None. If None, the config.yaml is\n            used.\n        device (Optional[torch.device]): Device to be used for the actor\n            and critic. Default to None. If None, the device available is\n            used.\n        debug (Optional[bool]): Enable prints for debugging. Default to False.\n\n    Attributes:\n        trainer (ConfigTrainer): Config parameters for the trainer\n        actor (ConfigActor): Config parameters for the actor\n        critic (ConfigCritic): Config parameters for the critic\n        reward (ConfigReward): Config parameters for the reward\n    \"\"\"\n\n    @beartype\n    def __init__(\n        self,\n        path: str,\n        device: Optional[torch.device] = None,\n        debug: Optional[bool] = False,\n    ) -> None:\n\n        # if not specified use the device available\n        if device is None:\n            if torch.cuda.is_available():\n                device = torch.device(\"cuda\")\n            else:\n                raise ValueError(\"No GPU available\")\n            print(f\"Current device used :{str(device)}\")\n\n        if path is None or os.path.exists(path) is False:\n            raise ValueError(\"Path to the config.yaml is not valid\")\n\n        # Read the config from yaml\n        with open(path, \"r\") as c:\n            config = yaml.safe_load(c)\n\n        trainer_dict = config[\"trainer_config\"]\n        actor_dict = config[\"actor_config\"]\n        critic_dict = config[\"critic_config\"]\n        reward_dict = config[\"reward_config\"]\n\n        # Trainer Config\n        trainer_dict[\"device\"] = device\n        trainer_dict[\"debug\"] = debug\n        self.trainer = ConfigTrainer(**trainer_dict)\n        # Actor Config\n        actor_dict[\"device\"] = device\n        actor_dict[\"debug\"] = debug\n        self.actor = 
ConfigActor(**actor_dict)\n        # Critic Config\n        critic_dict[\"device\"] = device\n        critic_dict[\"debug\"] = debug\n        self.critic = ConfigCritic(**critic_dict)\n        self.critic.is_reward = False\n        # Reward Config\n        reward_dict[\"device\"] = device\n        reward_dict[\"debug\"] = debug\n        self.reward = ConfigReward(**reward_dict)\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/dataset.py",
    "content": "import json\nimport os\n\nimport numpy as np\n\nfrom beartype.typing import Dict, List, Union\nfrom datasets import load_dataset\nfrom chatllama.rlhf.config import Config, ConfigActor, ConfigReward\nfrom chatllama.rlhf.reward import RewardModel, CriticModel\nfrom chatllama.rlhf.actor import ActorModel\n\n\nConfigType = Union[Config, ConfigActor, ConfigReward]\n\n\nclass BaseDataset:\n    def __init__(\n        self,\n    ) -> None:\n        pass\n\n    @staticmethod\n    def sort_conversation(\n        conversations: List[Dict],\n        only_input: bool = False,\n        reverse: bool = True,\n        shuffle: bool = True,\n    ) -> List[Dict]:\n        \"\"\"Sort the conversations by length of user_input + completion\n        or by length of user_input only\n\n        Args:\n            conversations (List[Dict]): list of conversations\n            only_input (bool, optional): sort by length of user_input only.\n                Defaults to False.\n            reverse (bool, optional): sort in descending order.\n                Defaults to True.\n            shuffle (bool, optional): shuffle the dataset leaving only the\n                first 100 samples sorted. 
Defaults to True.\n\n        Returns:\n            List[Dict]: sorted list of conversations\n        \"\"\"\n\n        # define the sorting function\n        if only_input is True:\n\n            def sort_fun(x):\n                return len(x[\"user_input\"])\n\n        else:\n\n            def sort_fun(x):\n                return len(x[\"user_input\"]) + len(x[\"completion\"])\n\n        # sort\n        conversations = sorted(\n            conversations,\n            key=sort_fun,\n            reverse=reverse,\n        )\n\n        # shuffle\n        if shuffle is True:\n            conversations = (\n                conversations[:10]\n                + np.random.choice(\n                    conversations[10:],\n                    size=len(conversations[10:]),\n                    replace=False,\n                ).tolist()\n            )\n\n        return conversations\n\n    @staticmethod\n    def take_n_samples(\n        conversations: List[Dict],\n        n: int,\n    ) -> List[Dict]:\n        \"\"\"Take N samples from the dataset\n\n        Args:\n            conversations (List[Dict]): list of conversations\n            n (int): number of samples to take randomly\n\n        Returns:\n            List[Dict]: list of N samples\n        \"\"\"\n\n        # sample N number of index from 0 to len(conversations)\n        indexes = np.random.choice(len(conversations), size=n, replace=False)\n        # take the samples\n        conversations = [conversations[i] for i in indexes]\n        return conversations\n\n    @staticmethod\n    def clean_dataset(config: ConfigType):\n        \"\"\"Clean the datasets by removing too long examples\n        The Reward Dataset constraints are:\n        - user_input + completion < Reward model max sequence length\n        The Actor Dataset constraints are:\n        - user_input + completion < Actor model max sequence length\n        The RLHF Training Dataset constraints are:\n        - user_input + min_completion < Actor model max 
sequence length\n        - user_input + min_completion < Critic model max sequence length\n        - user_input + min_completion < Reward model max sequence length\n\n        Args:\n            config (Config): config object\n        \"\"\"\n\n        if isinstance(config, Config):\n            print(\"Start cleaning the dataset for RLHF\")\n            # constraints\n            r_model_max_seq_len = config.reward.max_sequence_length\n            a_model_max_seq_len = config.actor.max_sequence_length\n            c_model_max_seq_len = config.critic.max_sequence_length\n            min_completion = config.actor.min_tokens\n            # dataset\n            dataset_path = config.trainer.examples_path\n            # tokenizers\n            r_tokenizer = RewardModel.load_tokenizer(config.reward)\n            a_tokenizer = ActorModel.load_tokenizer(config.actor)\n            c_tokenizer = CriticModel.load_tokenizer(config.critic)\n            # safety tokens\n            safety_tokens = config.actor.additonal_prompt_tokens\n\n        elif isinstance(config, ConfigActor):\n            print(\"Start cleaning the dataset for Actor\")\n            # constraint\n            a_model_max_seq_len = config.max_sequence_length\n            # dataset\n            dataset_path = config.train_dataset_path\n            # tokenizer\n            a_tokenizer = ActorModel.load_tokenizer(config)\n            # safety tokens\n            safety_tokens = config.additonal_prompt_tokens\n\n        elif isinstance(config, ConfigReward):\n            print(\"Start cleaning the dataset for Reward\")\n            # constraint\n            r_model_max_seq_len = config.max_sequence_length\n            # dataset\n            dataset_path = config.train_dataset_path\n            # tokenizer\n            r_tokenizer = RewardModel.load_tokenizer(config)\n\n        # if there is the datasets\n        if os.path.exists(dataset_path):\n\n            # load the dataset\n            with 
open(dataset_path, \"r\") as f:\n                conversations = json.load(f)\n\n            # sort in desceding order - longest first\n            if isinstance(config, Config):\n                conversations = BaseDataset.sort_conversation(\n                    conversations,\n                    only_input=True,\n                    reverse=True,\n                )\n            else:\n                conversations = BaseDataset.sort_conversation(\n                    conversations,\n                    only_input=False,\n                    reverse=True,\n                )\n\n            old_len = len(conversations)\n            # remove too long examples\n            # since datasets are ordered by the length\n            # we can remove the first elements until we find\n            # an example that is not too long\n            while len(conversations) > 0:\n\n                # get the text to be tokenized\n                if isinstance(config, Config):\n                    text = conversations[0][\"user_input\"]\n                else:\n                    text = (\n                        conversations[0][\"user_input\"]\n                        + conversations[0][\"completion\"]\n                    )\n\n                # remove elements from RLHF dataset\n                if isinstance(config, Config):\n                    a_tokens = a_tokenizer.encode(text, truncation=False)\n                    r_tokens = r_tokenizer.encode(text, truncation=False)\n                    c_tokens = c_tokenizer.encode(text, truncation=False)\n                    if (\n                        len(a_tokens) + min_completion + safety_tokens\n                        > a_model_max_seq_len\n                    ):\n                        conversations.pop(0)\n                    elif (\n                        len(r_tokens) + min_completion + safety_tokens\n                        > r_model_max_seq_len\n                    ):\n                        conversations.pop(0)\n           
         elif (\n                        len(c_tokens) + min_completion + safety_tokens\n                        > c_model_max_seq_len\n                    ):\n                        conversations.pop(0)\n                    else:\n                        break\n\n                # remove elements from Actor dataset\n                elif isinstance(config, ConfigActor):\n                    tokens = a_tokenizer.encode(text, truncation=False)\n                    if len(tokens) + safety_tokens > a_model_max_seq_len:\n                        conversations.pop(0)\n                    else:\n                        break\n\n                # remove elements from Reward dataset\n                elif isinstance(config, ConfigReward):\n                    tokens = r_tokenizer.encode(text, truncation=False)\n                    if len(tokens) > r_model_max_seq_len:\n                        conversations.pop(0)\n                    else:\n                        break\n\n            # if the number of examples has changed\n            if len(conversations) != old_len:\n                print(\"Number of examples before cleaning: \", old_len)\n                print(\n                    \"Number of examples after cleaning: \", len(conversations)\n                )\n\n                # remove the old dataset\n                os.remove(dataset_path)\n\n                # save the new dataset\n                with open(dataset_path, \"w\") as f:\n                    json.dump(conversations, f, indent=4)\n            else:\n                print(\"Dataset is already clean\")\n\n        else:\n            print(\n                f\"Dataset not found at {dataset_path}\"\n                f\" Skipping cleaning of the dataset\"\n            )\n\n\nclass StanfordNLPSHPDataset(BaseDataset):\n    \"\"\"Class for Stanford NLP SHP dataset from HuggingFace\"\"\"\n\n    def __init__(\n        self,\n    ) -> None:\n        print(\"Download the dataset\")\n        self.dataset = 
load_dataset(\"stanfordnlp/SHP\")\n        print(\"Download Completed\")\n\n    def reformat_dataset(self, data: List) -> List[Dict]:\n        \"\"\"Reformat the dataset to the format required by RLHF\n\n        Args:\n            data (List): dataset from HuggingFace\n\n        Returns:\n            List[Dict]: reformatted dataset\n        \"\"\"\n\n        # initialize conversations\n        conversations = []\n\n        # loop over the dataset\n        for i, d in enumerate(data):\n            if d[\"score_A\"] > d[\"score_B\"]:\n                response = d[\"human_ref_A\"]\n            else:\n                response = d[\"human_ref_B\"]\n\n            # compose user_input template\n            user_input = d[\"history\"].rstrip(\"\\n\")\n            user_input = \"Human: \" + d[\"history\"] + \"\\n\\n##\\n\\n\"\n\n            # compose completion template\n            completion = \"Assistant: \" + response\n            conv = {\n                \"user_input\": user_input,\n                \"completion\": completion,\n                \"score\": None,\n            }\n            conversations.append(conv)\n\n        return conversations\n\n    def save_dataset(\n        self, dataset_folder: str, number_of_samples: int, reverse: bool = True\n    ) -> None:\n        \"\"\"Save the dataset in the format required by RLHF\n\n        Args:\n            dataset_folder (str): path to the folder where the dataset\n                will be saved\n            number_of_samples (int): number of samples to take from the\n                dataset\n            reverse (bool, optional): sort the dataset in descending order.\n                Defaults to True.\n        \"\"\"\n\n        print(\"Generate datasets for RLHF\")\n\n        # take the train and test dataset to create the finetuning dataset\n        conversations = self.reformat_dataset(self.dataset[\"train\"])\n        conversations.extend(self.reformat_dataset(self.dataset[\"test\"]))\n\n        # sort conversations 
by length of user_input + completion\n        conversations = self.sort_conversation(conversations, reverse=reverse)\n\n        # save actor training data\n        with open(f\"{dataset_folder}/actor_training_data.json\", \"w\") as f:\n            json.dump(conversations, f, indent=4)\n\n        # take N samples and sort them\n        conversations = self.take_n_samples(conversations, number_of_samples)\n        conversations = self.sort_conversation(conversations, reverse=reverse)\n\n        # save reward training data\n        with open(f\"{dataset_folder}/reward_training_data.json\", \"w\") as f:\n            json.dump(conversations, f, indent=4)\n\n        # take the validation dataset for rlhf\n        conversations = self.reformat_dataset(self.dataset[\"validation\"])\n        # sort the validation dataset\n        conversations = self.sort_conversation(\n            conversations,\n            only_input=True,\n            reverse=reverse,\n        )\n        # save rlhf training data\n        with open(f\"{dataset_folder}/rlhf_training_data.json\", \"w\") as f:\n            json.dump(conversations, f, indent=4)\n\n        print(\"Generation Completed\")\n\n\nclass AnthropicRLHF(BaseDataset):\n    def __init__(\n        self,\n    ) -> None:\n\n        print(\"Download the dataset\")\n        self.dataset = load_dataset(\"Anthropic/hh-rlhf\")\n        print(\"Download Completed\")\n\n    def reformat_dataset(self, data: List) -> List[Dict]:\n        \"\"\"Reformat the dataset to the format required by RLHF\n\n        Args:\n            data (List): dataset from HuggingFace\n\n        Returns:\n            List[Dict]: reformatted dataset\n        \"\"\"\n\n        conversations = []\n        for _, d in enumerate(data):\n            current_conv = d[\"chosen\"]\n            split_answer = current_conv.split(\"Assistant:\")\n\n            # take all the list element in split_answer except the last one\n            # and joing them with \"Assistant:\" in a 
unique string\n            previous_convers = split_answer[0]\n            for i, s in enumerate(split_answer[1:-1]):\n                previous_convers += \"Assistant:\" + s\n\n            # remove the last characters if they are \"\\n\" from the previous\n            # conversation\n            previous_convers = previous_convers.rstrip(\"\\n\")\n            user_input = previous_convers + \"\\n\\n##\\n\\n\"\n            completion = \"Assistant: \" + split_answer[-1]\n\n            conv = {\n                \"user_input\": user_input,\n                \"completion\": completion,\n                \"score\": None,\n            }\n\n            conversations.append(conv)\n        return conversations\n\n    def save_dataset(\n        self, dataset_folder: str, number_of_samples: int, reverse: bool = True\n    ) -> None:\n        \"\"\"Save the dataset in the format required by RLHF\n\n        Args:\n            dataset_folder (str): path to the folder where the dataset\n                will be saved\n            number_of_samples (int): number of samples to take from the\n                dataset\n            reverse (bool, optional): sort the dataset in descending order.\n                Defaults to True.\n        \"\"\"\n\n        print(\"Generate datasets for RLHF\")\n\n        # generate actor and reward dataset\n        conversations = self.reformat_dataset(self.dataset[\"train\"])\n        conversations = self.sort_conversation(conversations, reverse=reverse)\n\n        # save actor training data\n        with open(f\"{dataset_folder}/actor_training_data.json\", \"w\") as f:\n            json.dump(conversations, f, indent=4)\n\n        # sample N number of index from 0 to len(conversations)\n        conversations = self.take_n_samples(conversations, number_of_samples)\n        conversations = self.sort_conversation(conversations, reverse=reverse)\n\n        # save reward training data\n        with open(f\"{dataset_folder}/reward_training_data.json\", \"w\") as 
f:\n            json.dump(conversations, f, indent=4)\n\n        # rlhf dataset\n        conversations = self.reformat_dataset(self.dataset[\"test\"])\n\n        # sort conversations by length of user_input\n        conversations = self.sort_conversation(\n            conversations, only_input=True, reverse=reverse\n        )\n\n        # save rlhf training data\n        with open(f\"{dataset_folder}/rlhf_training_data.json\", \"w\") as f:\n            json.dump(conversations, f, indent=4)\n\n        print(\"Generation Completed\")\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/model_list.py",
    "content": "# llama models\nllama_models = [\"llama-7B\", \"llama-13B\", \"llama-33B\", \"llama-65B\"]\n\n# HF Models\n# encoder-decoder models TODO: still not supported\nhf_models_seq_2_seq = [\n    \"google/flan-t5-xxl\",\n    \"google/flan-t5-xl\",\n    \"google/flan-t5-large\",\n    \"google/flan-t5-base\",\n    \"google/flan-t5-small\",\n]\n\n# decoder only TODO: codegen is still broken\nhf_models_causal_lm = [\n    \"facebook/opt-125m\",\n    \"facebook/opt-1.3b\",\n    \"facebook/opt-2.7b\",\n    \"facebook/opt-6.7b\",\n    \"facebook/opt-11b\",\n    \"facebook/galactica-125m\",\n    \"facebook/galactica-1.3b\",\n    \"facebook/galactica-6.7b\",\n    \"bigscience/bloom-560m\",\n    \"bigscience/bloomz-560m\",\n    \"bigscience/bloom-1b1\",\n    \"bigscience/bloomz-1b1\",\n    \"bigscience/bloom-1b7\",\n    \"bigscience/bloomz-1b7\",\n    \"bigscience/bloom-3b\",\n    \"bigscience/bloomz-3b\",\n    \"bigscience/bloom-7b1\",\n    \"bigscience/bloomz-7b1\",\n    \"EleutherAI/gpt-neo-1.3B\",\n    \"EleutherAI/gpt-neo-1.3B\",\n    \"EleutherAI/gpt-neox-20b\",\n    \"EleutherAI/gpt-j-6B\",\n    \"gpt2\",\n    \"gpt2-large\",\n    \"gpt2-xl\",\n    \"benjamin/gerpt2\",\n    \"benjamin/gerpt2-large\",\n    \"Salesforce/codegen-350M-mono\",\n    \"Salesforce/codegen-2B-mono\",\n    \"Salesforce/codegen-6B-mono\",\n    \"Salesforce/codegen-16B-mono\",\n]\n\n# create a list of all the models from hf\nhf_models = hf_models_seq_2_seq + hf_models_causal_lm\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/model_loader.py",
    "content": "import os\nimport shutil\n\nfrom beartype.typing import Union, Optional, Tuple\n\nfrom chatllama.rlhf.config import (\n    Config,\n    ConfigActor,\n    ConfigCritic,\n    ConfigReward,\n)\nfrom chatllama.rlhf.model_list import hf_models\n\nConfigType = Union[Config, ConfigActor, ConfigCritic, ConfigReward]\n\n\nclass ModelLoader:\n    \"\"\"Class to load and save models and their checkpoints during training.\"\"\"\n\n    def __init__(\n        self,\n    ) -> None:\n        pass\n\n    @staticmethod\n    def get_training_stats_path(config: ConfigType) -> str:\n        \"\"\"Method to get the path to the training stats file. Used when saving\n\n        Args:\n            config (ConfigType): the config object\n        \"\"\"\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config, is_checkpoint=True\n        )\n        stat_path = os.path.join(model_folder, \"training_stats.json\")\n        return stat_path\n\n    @staticmethod\n    def look_for_last_checkpoint(\n        model_folder: str,\n        model_name: str,\n    ) -> Optional[str]:\n        \"\"\"Method to look for the last checkpoint in the model folder\n        checkpoint are saved as {model_name}_epoch_{current_epoch}.pt\n\n        Args:\n            model_folder (str): the folder where the checkpoints are saved\n            model_name (str): the name of the model\n        \"\"\"\n        # remove .pt to model name\n        model_name = model_name.split(\".\")[0]\n        checkpoints = [\n            f for f in os.listdir(model_folder) if f.startswith(model_name)\n        ]\n        if len(checkpoints) == 0:\n            return None\n        else:\n            checkpoints = sorted(checkpoints)\n            # get last checkpoint\n            last_checkpoint = checkpoints[-1]\n            return last_checkpoint\n\n    @staticmethod\n    def look_for_checkpoint_by_name(\n        model_folder: str,\n        checkpoint_name: str,\n    ) -> Optional[str]:\n    
    \"\"\"Method to look for a particular checkpoint in the model folder\n        checkpoint are saved as\n        {model_name}_epoch_{current_epoch}_steps_{current_steps}.pt\n\n        Args:\n            model_folder (str): the folder where the checkpoints are saved\n            checkpoint_name (str): the name of the checkpoint\n        \"\"\"\n        # look for a file named checkpoint_name in the model folder\n        path = os.path.join(model_folder, checkpoint_name)\n        if os.path.exists(path):\n            return checkpoint_name\n        else:\n            return None\n\n    @staticmethod\n    def get_checkpoint_name(config: ConfigType) -> str:\n        if isinstance(config, Config):\n            return config.trainer.checkpoint_name\n        else:\n            return config.checkpoint_name\n\n    @staticmethod\n    def get_base_model_folder_from_config(config: ConfigType) -> str:\n        if isinstance(config, ConfigActor) or isinstance(config, ConfigReward):\n            return config.model_folder\n        elif isinstance(config, Config):\n            return config.actor.model_folder\n        else:\n            raise ValueError(\n                \"Config type not recognized during saving or loading\"\n            )\n\n    @staticmethod\n    def get_model_type_from_config(config: ConfigType) -> str:\n        if isinstance(config, ConfigReward):\n            # here use ad-hoc flag from config to distinguish between\n            #  reward and critic\n            if config.is_reward:\n                return \"reward\"\n            else:\n                return \"critic\"\n        elif isinstance(config, ConfigActor):\n            return \"actor\"\n        elif isinstance(config, Config):\n            return \"actor_rl\"\n\n    @staticmethod\n    def get_model_name_from_config(config: ConfigType) -> str:\n        model_name = None\n        if isinstance(config, Config):\n            model_name = config.actor.model\n        elif isinstance(config, 
ConfigReward) or isinstance(\n            config, ConfigActor\n        ):\n            model_name = config.model\n        if model_name in hf_models:\n            return os.path.split(model_name)[-1]\n        if model_name is None:\n            raise ValueError(\"Model name not found\")\n        return model_name\n\n    @staticmethod\n    def delete_old_checkpoints(\n        model_folder: str, model_name: str, n_ckp_to_keep: int = 5\n    ):\n        \"\"\"Method to discard old checkpoints, keeping only the last\n        n_ckp_to_keep\n\n        Args:\n            model_folder (str): the folder where the checkpoints are saved\n            model_name (str): the name of the model\n            n_ckp_to_keep (int): the number of checkpoints to keep\n        \"\"\"\n\n        # remove .pt to model name\n        model_name = model_name.split(\".\")[0]\n        checkpoints = [\n            f for f in os.listdir(model_folder) if f.startswith(model_name)\n        ]\n        if len(checkpoints) == 0:\n            return\n        else:\n            checkpoints = sorted(checkpoints)\n            # check if the number of checkpoint is greater than 5\n            if len(checkpoints) > n_ckp_to_keep:\n                for c in checkpoints[:-n_ckp_to_keep]:\n                    checkpoint_path = os.path.join(model_folder, c)\n                    os.remove(checkpoint_path)\n\n    @staticmethod\n    def get_model_path(\n        config: ConfigType,\n        is_checkpoint: bool = False,\n        current_epoch: Optional[int] = None,\n        current_step: Optional[int] = None,\n        max_epochs: int = 1_000_000_000,\n        max_steps: int = 1_000_000_000,\n    ) -> Tuple[str, str, Optional[str]]:\n        \"\"\"Method to get the path to the right model file. 
Used when saving\n        the model.\n        The hierarchy of the model folder is:\n        -- model_folder: here store the models trained, for each type of model\n                        there is a dedicated folder\n            -- actor\n            -- critic\n            -- reward\n            -- actor_rl\n            -- checkpoints: here store the checkpoints during training, for\n                            each type of model there is a dedicated folder\n                -- actor\n                -- critic\n                -- reward\n                -- actor_rl\n\n        Args:\n            config (ConfigType): the config object, contains info of the model\n            is_checkpoint (bool): if True, the path is for a checkpoint\n            current_epoch (Optional[int]): the current epoch, used to create\n                the checkpoint name. If is_checkpoint is True, and\n                current_epoch is None, return just the folder and the simple\n                model name for the possible checkpoint.\n            current_step (Optional[int]): the current step, used to create\n                the checkpoint name.\n            max_epochs (Optional[int]): the maximum number of epochs, used to\n                create the checkpoint name.\n            max_steps (Optional[int]): the maximum number of steps, used to\n                create the checkpoint name.\n\n        Returns:\n            model_folder (str): the folder where the model is saved\n            model_name (str): the name of the model\n            path (Optional[str]): the path to the model. 
If is_checkpoint is\n                True, and current_epoch is None, return None\n        \"\"\"\n        model_folder = ModelLoader.get_base_model_folder_from_config(config)\n\n        # Add the checkpoint path if necessary\n        if is_checkpoint:\n            model_folder = os.path.join(model_folder, \"checkpoints\")\n\n        # Create the folder for the model type\n        #  (Actor, Critic, Reward, Actor_RL)\n        model_type = ModelLoader.get_model_type_from_config(config)\n        model_folder = os.path.join(model_folder, model_type)\n\n        # Make the path if not exists\n        if os.path.exists(model_folder) is False:\n            os.makedirs(model_folder, exist_ok=True)\n            print(f\"Model folder does not exist. Creating it: {model_folder}\")\n\n        # Create the model name\n        model_name = ModelLoader.get_model_name_from_config(config)\n\n        # If is a checkpoint and current epoch are available\n        # extend the model name with the epoch, if none epoch is provided\n        # just return the simple model name\n        if is_checkpoint and current_epoch is not None:\n            # number of characters to store the checkpoints\n            n_char = max(len(str(max_epochs)), len(str(max_steps)))\n            # create the string epoch such that it is always the same length\n            # equalt to n_char (i.e. 
00000001) necessary for sorting\n            string_epoch = str(current_epoch)\n            string_epoch = \"0\" * (n_char - len(string_epoch)) + string_epoch\n            string_epoch = f\"_epoch_{string_epoch}\"\n            if current_step is not None:\n                string_step = str(current_step)\n                string_step = \"0\" * (n_char - len(string_step)) + string_step\n                string_step = f\"_step_{string_step}\"\n                model_name = f\"{model_name}{string_epoch}{string_step}.pt\"\n            else:\n                model_name = f\"{model_name}{string_epoch}.pt\"\n        else:\n            model_name = f\"{model_name}.pt\"\n\n        # if the epoch is not provided, and it is a checkpoint\n        # is impossible to know the path to the file.\n        # but we can know the model folder and the model name\n        if is_checkpoint and current_epoch is None:\n            path = None\n        else:\n            path = os.path.join(model_folder, model_name)\n        return model_folder, model_name, path\n\n    @staticmethod\n    def check_model_path(\n        config: ConfigType,\n        is_checkpoint: bool = False,\n        current_epoch: Optional[int] = None,\n        current_step: Optional[int] = None,\n    ) -> Optional[int]:\n        \"\"\"Method to check if the model path exists to load models\n        or checkpoints.\n\n        Args:\n            config (ConfigType): the config object, contains info of the model\n            is_checkpoint (bool): if True, the path is for a checkpoint\n            current_epoch (Optional[int]): the current epoch.\n                is is_checkpoint is True, and current_epoch is None,\n                it will look for the last checkpoint and return it.\n\n        Returns:\n            path (Optional[str]): the path to the model. If is_checkpoint is\n                True, and current_epoch is None, search for the last checkpoint\n                and return it. 
If no checkpoint is found, return None.\n            epoch (Optional[int]): the epoch of the checkpoint if an actual\n                checkpoint is found. If no checkpoint is found, return None.\n        \"\"\"\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config,\n            is_checkpoint,\n            current_epoch,\n        )\n\n        # If i am looking for a checkpoint.\n        if is_checkpoint and current_epoch is None:\n            # If the checkpoint is specified by name use it\n            checkpoint_name = ModelLoader.get_checkpoint_name(config)\n            if checkpoint_name is not None:\n                checkpoint = ModelLoader.look_for_checkpoint_by_name(\n                    model_folder, checkpoint_name\n                )\n            else:\n                checkpoint = ModelLoader.look_for_last_checkpoint(\n                    model_folder, model_name\n                )\n            if checkpoint is not None:\n                path = os.path.join(model_folder, checkpoint)\n                # Get the epoch number from the checkpoint name\n\n        if path is not None:\n            if os.path.exists(path) is False:\n                path = None\n\n        if path is None:\n            if is_checkpoint:\n                checkpoint_name = ModelLoader.get_checkpoint_name(config)\n                if checkpoint_name is not None:\n                    print(\n                        f\"No checkpoint found at {model_folder} \"\n                        f\"with name {config.checkpoint_name}\"\n                    )\n                else:\n                    print(\n                        f\"No previous checkpoint found at \"\n                        f\"{model_folder} for {model_name}\"\n                    )\n            else:\n                print(\n                    f\"No previous model found at \"\n                    f\"{model_folder} for model {model_name}\"\n                )\n        else:\n            if 
is_checkpoint:\n                # the name is modelname_epoch_00000001_step_00000001.pt\n                # or modelname_epoch_00000001.pt\n                if \"_step_\" in path:\n                    epoch = int(path.split(\"_epoch_\")[-1].split(\"_\")[0])\n                    step = int(path.split(\"_step_\")[-1].split(\".\")[0])\n                    print(\n                        f\"Found checkpoint for epoch {epoch + 1},\"\n                        f\" step {step + 1}...\"\n                    )\n                else:\n                    epoch = int(path.split(\"_epoch_\")[-1].split(\".\")[0])\n                    print(f\"Found checkpoint for epoch {epoch + 1} ...\")\n            else:\n                print(f\"Found model at {path}\")\n        return path\n\n    def init_critic_from_reward(config: ConfigCritic) -> None:\n        \"\"\"Method to initialize the critic from the reward model.\n        If the critic folder is empty\n        \"\"\"\n\n        if config.is_reward is True:\n            raise ValueError(\n                \"The config should work for the Critic model,\"\n                \"but the config seems to be for the Reward model\"\n            )\n\n        # check that the critic folder is empty\n        path = ModelLoader.check_model_path(config)\n        _, _, critic_path = ModelLoader.get_model_path(config)\n        if path is None:\n            print(\"Initializing Critic from Reward model...\")\n            config.is_reward = True\n            path = ModelLoader.check_model_path(config)\n            if path is not None:\n                _, _, reward_path = ModelLoader.get_model_path(config)\n                # copy the file in reward_path to critic_path\n                shutil.copy(reward_path, critic_path)\n            else:\n                print(\"Critic Model remains uninitialized\")\n        config.is_reward = False\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/reward.py",
    "content": "import json\nimport shutil\nimport os\n\nimport deepspeed\nimport torch\nfrom accelerate import Accelerator\nfrom beartype import beartype\nfrom beartype.typing import Iterable, Tuple\nfrom einops.layers.torch import Rearrange\nfrom torch.utils.data import Dataset, DataLoader\nfrom transformers import (\n    AutoModel,\n    AutoTokenizer,\n)\n\nfrom chatllama.rlhf.config import ConfigReward\nfrom chatllama.rlhf.model_list import hf_models\nfrom chatllama.rlhf.model_loader import ModelLoader\nfrom chatllama.rlhf.utils import TrainingStats\n\n\nclass RewardModel(torch.nn.Module):\n    \"\"\"Model to be trained to predict the reward for RL.\n    or to be used as Critic in RL. It is a Language Model with a head\n    that predicts the reward (a scalar) for a given sequence of tokens.\n\n    Attributes:\n        model (torch.nn.Module): Model to be used for the reward model\n        tokenizer (torch.nn.Module): Tokenizer to be used for the reward model\n        head (torch.nn.Module): Head to be used for the reward model\n        config (ConfigReward): Config parameters for the reward model\n\n    Methods:\n        load_tokenizer: Load the tokenizer for the reward model\n        forward: Forward pass of the model (used by the critic)\n        save: Save the model\n        load: Load the model\n        get_reward: Get the reward for a given input (used by the reward model)\n        parameters: Return the parameters of the reward model\n\n    \"\"\"\n\n    def __init__(self, config: ConfigReward) -> None:\n        super().__init__()\n\n        # store config\n        self.config = config\n\n        # initialize the self.model\n        head_hidden_size = config.model_head_hidden_size\n        if config.model in hf_models:\n            self.tokenizer = self.load_tokenizer(config)\n            self.model = AutoModel.from_pretrained(config.model)\n            head_dim = self.model.config.hidden_size\n            if config.model.startswith(\"gpt2\"):\n           
     head_dim = self.model.config.n_embd\n            self.head = torch.nn.Sequential(\n                torch.nn.Linear(head_dim, head_hidden_size),\n                torch.nn.ReLU(),\n                torch.nn.Linear(head_hidden_size, 1),\n                Rearrange(\"... 1 -> ...\"),\n            )\n        else:\n            raise ValueError(f\"Model {config.model} not supported\")\n\n        # load the model\n        self.load()\n\n        # freeze model parameters (only train the head)\n        # for param in self.model.parameters():\n        #     param.requires_grad = False\n\n        # move model to device\n        self.model.to(config.device)\n        self.head.to(config.device)\n\n    @staticmethod\n    def load_tokenizer(config: ConfigReward):\n        # load tokenizer from HF\n        tokenizer = AutoTokenizer.from_pretrained(\n            config.model,\n            padding_side=\"left\",\n            padding=True,\n            truncation=True,\n            model_max_length=config.max_sequence_length,\n        )\n\n        # add eos token if not present\n        if tokenizer.eos_token is None:\n            tokenizer.eos_token = \"</s>\"\n            tokenizer.eos_token_id = 2  # OPT  eos token id\n\n        # add pad token if not present\n        if tokenizer.pad_token is None:\n            tokenizer.pad_token = tokenizer.eos_token\n            tokenizer.pad_token_id = tokenizer.eos_token_id\n        return tokenizer\n\n    @beartype\n    def load(self) -> None:\n        \"\"\"Load the model from the path\"\"\"\n        # look for a pretrained model\n        path = ModelLoader.check_model_path(\n            config=self.config,\n            is_checkpoint=False,\n            current_epoch=None,\n        )\n\n        # check if the model exists\n        if path is not None:\n\n            # load the model from the path\n            print(\"Loading ...\")\n            model_dict = torch.load(path)\n            
self.model.load_state_dict(model_dict.get(\"state_dict\") or model_dict.get(\"model\"))\n            self.head.load_state_dict(model_dict[\"head\"])\n\n    @beartype\n    def save(self) -> None:\n        \"\"\"Save the model to the path\"\"\"\n        # get the path to save the model\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config,\n            is_checkpoint=False,\n            current_epoch=None,\n        )\n\n        # save the model\n        print(f\"Saving model to {path} ...\")\n        torch.save(\n            {\"model\": self.model.state_dict(), \"head\": self.head.state_dict()},\n            path,\n        )\n\n    @beartype\n    def parameters(\n        self,\n    ) -> Iterable[torch.nn.Parameter]:\n        \"\"\"Return the parameters of the reward model\"\"\"\n        for p in self.model.parameters():\n            yield p\n        for p in self.head.parameters():\n            yield p\n\n    @beartype\n    def forward(\n        self, output_sequence: torch.Tensor, output_sequence_mask: torch.Tensor\n    ) -> torch.Tensor:\n        \"\"\"Generate the sequence of rewards for the given output sequence\n        what is the quality of the output sequence tokens?\n\n        Args:\n            output_sequence (torch.Tensor): The sequence of tokens to be\n                evaluated\n            output_sequence_mask (torch.Tensor): Mask for the attention\n\n        Returns:\n            torch.Tensor: Rewards for the given output sequence\n        \"\"\"\n        output = self.model(\n            output_sequence, attention_mask=output_sequence_mask\n        )\n\n        # What if the output_sequence is longer than the max context of\n        # the model?\n        rewards = self.head(output.last_hidden_state)\n        if self.config.debug:\n            print(\"RewardModel.forward\")\n            print(\"output_sequence.shape\", output_sequence.shape)\n            print(\"output_sequence\", output_sequence)\n        
    print(\"reward.shape\", rewards.shape)\n            print(\"reward\", rewards)\n        return rewards\n\n    @beartype\n    def get_reward(\n        self, output_sequence: torch.Tensor, output_sequence_mask: torch.Tensor\n    ) -> torch.Tensor:\n        \"\"\"Get the reward for the given output sequence\n\n        Args:\n            output_sequence (torch.Tensor): The concatenation of initial input\n                and actor output as tokens\n            output_sequence_mask (torch.Tensor): Mask for the attention\n        \"\"\"\n        if output_sequence.shape[1] > self.config.max_sequence_length:\n            raise ValueError(\n                f\"Output sequence is too long: {output_sequence.shape[1]}\"\n                f\" > {self.config.max_sequence_length}\"\n            )\n        rewards = self.forward(output_sequence, output_sequence_mask)\n        return rewards[:, -1]\n\n\n# just to keep namings consistent\nCriticModel = RewardModel\n\n\nclass RewardDataset(Dataset):\n    \"\"\"Dataset class for the reward model\n    read a json file with the following format:\n    [\n        {\n            \"user_input\": \"...\",\n            \"completion\": \"...\",\n            \"score\": ...\n        },\n        ...\n    ]\n    Where:\n        user_input: the initial input of the user\n        completion: the completion generated by the model\n        score: the score given by the user to the completion (or by the LLM)\n    \"\"\"\n\n    def __init__(self, path: str) -> None:\n        print(f\"Loading dataset from {path}\")\n        with open(path, \"r\") as f:\n            self.data = list(json.load(f))\n        print(f\"Loaded {len(self.data)} samples\")\n\n    def __getitem__(self, idx: int):\n        user_input = self.data[idx][\"user_input\"]\n        completion = self.data[idx][\"completion\"]\n        if self.data[idx][\"score\"]:\n            score = float(self.data[idx][\"score\"])\n        else:\n            score = 2.5\n\n        item = (user_input + 
completion, score)\n        return item\n\n    def __len__(\n        self,\n    ):\n        return len(self.data)\n\n\nclass RewardTrainer:\n    \"\"\"Class to train the reward model\n\n    Args:\n        config (ConfigModel): Config parameters for the model\n\n    Attributes:\n        model (RewardModel): Reward model\n        config (ConfigModel): Config parameters for the model\n        optimizer (torch.optim): Optimizer for the model\n        loss_function (torch.nn): Loss function for the model\n        validation_flag (bool): Flag to indicate if the validation dataset\n            is available\n        train_dataset (RewardDataset): Dataset for training\n        validation_dataset (RewardDataset): Dataset for validation\n        train_dataloader (DataLoader): Dataloader for training\n        validation_dataloader (DataLoader): Dataloader for validation\n        scheduler (torch.optim.lr_scheduler): Scheduler for the optimizer\n        training_stats (List[Dict]): List of dictionaries with the training\n            statistics\n        model_engine (ModelEngine): Model engine to train the model\n            using deepspeed\n        accelerator (Accelerator): Accelerator to train the model using\n            accelerate by HF.\n\n\n    Methods:\n        train: Train the reward model\n        save_checkpoints: Save the checkpoints of the model\n        load_checkpoints: Load the checkpoints of the model\n    \"\"\"\n\n    def __init__(self, config: ConfigReward) -> None:\n\n        # save the config\n        self.config = config\n\n        # load the model\n        self.reward = RewardModel(config)\n\n        # optimizer\n        self.optimizer = torch.optim.AdamW(\n            self.reward.parameters(), lr=config.lr\n        )\n\n        # loss function\n        self.loss_function = torch.nn.MSELoss()\n\n        # check validation dataset\n        self.validation_flag = False\n        if config.validation_dataset_path is not None:\n            self.validation_flag 
= True\n\n        # create dataset and dataloaders\n        self.train_dataset = RewardDataset(config.train_dataset_path)\n        self.train_dataloader = DataLoader(\n            self.train_dataset, batch_size=config.batch_size\n        )\n        if self.validation_flag:\n            self.eval_dataset = RewardDataset(config.validation_dataset_path)\n            self.validation_dataloader = DataLoader(\n                self.eval_dataset, batch_size=config.batch_size\n            )\n\n        # intilize scheduler - learning rate will drop to 10% of the initial\n        # value\n        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(\n            self.optimizer,\n            T_0=len(self.train_dataset) // config.batch_size,\n            T_mult=1,\n            eta_min=config.lr * 0.1,\n            last_epoch=-1,\n        )\n\n        # initialize training stats\n        stats_path = ModelLoader.get_training_stats_path(config)\n        self.training_stats = TrainingStats(stats_path)\n\n        # consistency check between accelerate and deepspeed\n        if config.accelerate_enable and config.deepspeed_enable:\n            raise ValueError(\n                \"Both DeepSpeed and Accelerate are enabled for the Reward.\"\n                \"Please choose one of them.\"\n            )\n\n        # initialize deepspeed\n        self.model_engine = None\n        if config.deepspeed_enable is True:\n\n            if config.deepspeed_config_path is None:\n                raise ValueError(\n                    \"DeepSpeed config path is None, but deepspeed is enabled\"\n                )\n            if os.path.exists(config.deepspeed_config_path) is False:\n                raise ValueError(\n                    f\"DeepSpeed config path {config.deepspeed_config_path}\"\n                    f\"does not exist\"\n                )\n            (\n                self.model_engine,\n                self.optimizer,\n                self.train_dataloader,\n     
           self.scheduler,\n            ) = deepspeed.initialize(\n                args=None,\n                model=self.reward,\n                model_parameters=self.reward.parameters(),\n                training_data=self.train_dataset,\n                config=self.config.deepspeed_config_path,\n            )\n            print(\"Training with DeepSpeed\")\n\n        # initialize accelerate\n        self.accelerator = None\n        if config.accelerate_enable is True:\n            self.accelerator = Accelerator()\n            (\n                self.reward,\n                self.optimizer,\n                self.train_dataloader,\n                self.scheduler,\n            ) = self.accelerator.prepare(\n                self.reward,\n                self.optimizer,\n                self.train_dataloader,\n                self.scheduler,\n            )\n            print(\"Training with Accelerate\")\n\n    @beartype\n    def save_checkpoint(\n        self,\n        current_epoch: int,\n        current_step: int,\n        max_epochs: int,\n        max_steps: int,\n    ) -> None:\n        \"\"\"Save the checkpoints of the model\n\n        Args:\n            current_epoch (int): Current epoch\n            current_step (int): Current step\n            max_epochs (int): Maximum number of epochs\n            max_steps (int): Maximum number of steps\n        \"\"\"\n\n        print(\n            f\"Saving checkpoint for epoch {current_epoch + 1}, \"\n            f\" step {current_step} ...\"\n        )\n\n        # get the path to save the checkpoint\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config,\n            is_checkpoint=True,\n            current_epoch=current_epoch,\n            current_step=current_step,\n            max_epochs=max_epochs,\n            max_steps=max_steps,\n        )\n\n        # remove the checkpoint if it already exists\n        if os.path.exists(path):\n            if 
self.config.deepspeed_enable:\n                shutil.rmtree(path)\n            else:\n                os.remove(path)\n\n        # save the checkpoint\n        if self.config.deepspeed_enable:\n            client_state = {\n                \"epoch\": current_epoch,\n                \"step\": current_step,\n            }\n            self.model_engine.save_checkpoint(path, client_state=client_state)\n        else:\n            torch.save(\n                {\n                    \"state_dict\": self.reward.model.state_dict(),\n                    \"optim_state_dict\": self.optimizer.state_dict(),\n                    \"scheduler_state_dict\": self.scheduler.state_dict(),\n                    \"training_stats\": self.training_stats,\n                    \"epoch\": current_epoch,\n                    \"step\": current_step,\n                },\n                path,\n            )\n\n    @beartype\n    def load_checkpoint(\n        self,\n    ) -> Tuple[int, int]:\n        \"\"\"Load the checkpoints of the model\n\n        Returns:\n            Tuple[int, int]: The current epoch and step\n                from which you should resume the training\n        \"\"\"\n\n        print(\"Looking for checkpoints...\")\n        # look for the checkpoints\n        path = ModelLoader.check_model_path(\n            config=self.config,\n            is_checkpoint=True,\n            current_epoch=None,\n        )\n\n        # check if a checkpoint exists\n        if path is not None:\n            print(\"Loading ...\")\n\n            if self.config.deepspeed_enable:\n                # try to load the checkpoint\n                try:\n                    _, client_state = self.model_engine.load_checkpoint(path)\n                except Exception:\n                    print(\n                        \"Checkpoint corrupted!\"\n                        \"Try to remove the last checkpoint.\"\n                        \"Now Starting from epoch 0, step 0\"\n                    )\n              
      return 0, 0\n                # load epoch and step to resume loops\n                epoch = client_state[\"epoch\"]\n                step = client_state[\"step\"]\n            else:\n                # try to load the checkpoint\n                try:\n                    checkpoint = torch.load(path)\n                except Exception:\n                    print(\n                        \"Checkpoint corrupted!\"\n                        \"Try to remove the last checkpoint.\"\n                        \"Now Starting from epoch 0, step 0\"\n                    )\n                    return 0, 0\n\n                # load the model parameters and optimizer parameters\n                # from the checkpoint\n                epoch = checkpoint[\"epoch\"]\n                self.reward.model.load_state_dict(checkpoint[\"state_dict\"])\n                self.optimizer.load_state_dict(checkpoint[\"optim_state_dict\"])\n                self.scheduler.load_state_dict(\n                    checkpoint[\"scheduler_state_dict\"]\n                )\n                self.training_stats = checkpoint[\"training_stats\"]\n                step = checkpoint[\"step\"]\n            return epoch, step + 1  # return the next episode to train\n        return 0, 0\n\n    def train(\n        self,\n    ) -> None:\n        \"\"\"Train the reward model\"\"\"\n        print(\"Start Training the Reward Model\")\n\n        # get config parameters\n        if self.config.deepspeed_enable:\n            batch_size = self.train_dataloader.batch_size\n        else:\n            batch_size = self.config.batch_size\n        epochs = self.config.epochs\n        device = self.config.device\n        iteration_per_print = self.config.iteration_per_print\n        checkpoint_steps = self.config.checkpoint_steps\n\n        # compute the number of iterations\n        n_iter = int(len(self.train_dataset) / batch_size)\n\n        # load checkpoint\n        start_epoch, start_step = self.load_checkpoint()\n\n        
# counter for the checkpoint\n        cnt_checkpoints = 1\n\n        # traing loop\n        for epoch in range(start_epoch, epochs):\n            self.reward.train()\n            for i, inputs in enumerate(self.train_dataloader):\n\n                # skip the steps if resuming from a checkpoint\n                if i < start_step:\n                    continue\n\n                # get the inputs\n                input_text = inputs[0]\n                score = inputs[1]\n\n                # tokenize the input\n                with torch.no_grad():\n                    input_tokens = self.reward.tokenizer(\n                        input_text,\n                        return_tensors=\"pt\",\n                        truncation=True,\n                        padding=True,\n                    )\n                    output = torch.as_tensor(\n                        score, dtype=torch.float32, device=device\n                    )\n\n                # forward pass\n                if self.config.deepspeed_enable:\n                    est_output = self.model_engine(\n                        input_tokens[\"input_ids\"].to(device),\n                        input_tokens[\"attention_mask\"].to(device),\n                    )[:, -1]\n                else:\n                    est_output = self.reward.get_reward(\n                        input_tokens[\"input_ids\"].to(device),\n                        input_tokens[\"attention_mask\"].to(device),\n                    )\n\n                # compute the loss\n                loss = self.loss_function(est_output, output)\n                self.training_stats.training_loss.append(loss.item())\n\n                # backward pass\n                if self.config.deepspeed_enable:\n                    self.model_engine.backward(loss)\n                    self.model_engine.step()\n                elif self.config.accelerate_enable:\n                    self.optimizer.zero_grad()\n                    self.accelerator.backward(loss)\n          
          self.optimizer.step()\n                    self.scheduler.step()\n                else:\n                    self.optimizer.zero_grad()\n                    loss.backward()\n                    self.optimizer.step()\n                    self.scheduler.step()\n\n                # print progress\n                if i % iteration_per_print == 0:\n                    print(\n                        f\"Epoch: {epoch+1}/{epochs}, \"\n                        f\"Iteration: {i+1}/{n_iter}, \"\n                        f\"Training Loss: {loss.item()}\"\n                    )\n                    printed_est_output = [\n                        round(float(x), 1) for x in est_output.cpu().tolist()\n                    ]\n                    print(\n                        \"prediction\",\n                        printed_est_output,\n                        \"target\",\n                        score.cpu().tolist(),\n                    )\n\n                # checkpoints saving\n                if cnt_checkpoints % checkpoint_steps == 0:\n                    self.save_checkpoint(epoch, i, epochs, n_iter)\n                    cnt_checkpoints = 1\n                else:\n                    cnt_checkpoints += 1\n\n            # Validation\n            if self.validation_flag:\n                self.reward.eval()\n                with torch.no_grad():\n                    for i, (text, score) in enumerate(\n                        self.validation_dataloader\n                    ):\n\n                        # tokenize inputs\n                        input_tokens = self.reward.tokenizer(\n                            text, return_tensors=\"pt\", padding=True\n                        )\n                        input_tokens = input_tokens.to(device)\n                        # TODO: check on the length of the input tokens if\n                        # they are too many it can create problems\n                        output = torch.tensor(score, dtype=torch.float32).to(\n          
                  device\n                        )\n\n                        # forward pass\n                        est_output = self.reward.get_reward(\n                            input_tokens[\"input_ids\"],\n                            input_tokens[\"attention_mask\"],\n                        )\n\n                        # compute loss\n                        loss = self.loss_function(est_output, output)\n                        self.training_stats.validation_loss.append(loss.item())\n\n                        # print progress\n                        if i % iteration_per_print == 0:\n                            print(\n                                f\"Epoch: {epoch+1}/{epochs}, \"\n                                f\"Iteration: {i+1}/{n_iter}, \"\n                                f\"Validation Loss: {loss.item()}\"\n                            )\n            # reset start_step after training is resumed\n            start_step = 0\n\n        # save the model at the end of the training\n        self.reward.save()\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/trainer.py",
    "content": "import json\nimport os\nimport random\nfrom collections import deque, namedtuple\n\nimport deepspeed\nimport torch\nimport torch.distributed as dist\nfrom accelerate import Accelerator\nfrom beartype import beartype\nfrom beartype.typing import Deque, List, Tuple, Union\nfrom deepspeed.runtime.engine import DeepSpeedEngine\nfrom torch.utils.data import DataLoader, Dataset\nfrom torch.optim.lr_scheduler import CosineAnnealingWarmRestarts\n\nfrom chatllama.rlhf.actor import ActorModel\nfrom chatllama.rlhf.config import (\n    Config,\n    ConfigActor,\n    ConfigCritic,\n    ConfigReward,\n)\nfrom chatllama.rlhf.model_list import hf_models\nfrom chatllama.rlhf.model_loader import ModelLoader\nfrom chatllama.rlhf.reward import RewardModel, CriticModel\nfrom chatllama.rlhf.utils import TrainingStats, ConversationLog\n\n\n\"\"\"\ntrain()\n┌─────────────────────────────┐\n│                             │◄─────────────────────────┐\n│                             │                          │\n│      ┌─────────────┐        │                          │\n│      │ user input  │        │                          │ learn()\n│      └─────┬───────┘        │             ┌────────────┴─────────────┐\n│            │                │             │                          │\n│            │                │             │       ┌────────┐         │\n│            │                │             │   ┌───│ Update │──┐      │\n│            │                │             │   │   └────▲───┘  │      │\n│   ┌────────▼────────────┐   │             │   │        │      │      │\n│   │  Actor (LLM Model)  │   │             │   │     ┌──┴───┐  │      │\n│   └────────┬────────────┘   │             │   │     │ PPO  │  │      │\n│            │                │             │   │     └▲────▲┘  │      │\n│            │                │             │   │      │    │   │      │\n│            │                │             │   │      │    │   │      │\n│    ┌───────▼──────┐         │            
 │ ┌─▼──────┴┐ ┌─┴───▼──┐   │\n│    │ Reward Model │         │             │ │  Actor  │ │ Critic │   │\n│    └──────────────┘         │             │ └─────────┘ └────────┘   │\n│                             │             │                          │\n│                             │ x Episodes  └─────────────▲────────────┘\n└───────────────┬─────────────┘                           │   x Epochs\n                │ store N Examples per Timestep           │  \n         ┌──────▼──────┐                                  │\n         │             │                                  │\n         │  Memories   ├──────────────────────────────────┘\n         │             │ (update timesteps x N Examples)\n         └─────────────┘\n\"\"\"  # noqa W291\n\n\ndef change_tokenization(tokens, tokenizer1, tokenizer2):\n    \"\"\"Change the tokenizer of the tokens\n\n    Args:\n        tokens (torch.Tensor): Tokens to be changed\n        tokenizer1 (transformers.PreTrainedTokenizer): Tokenizer to be changed\n        tokenizer2 (transformers.PreTrainedTokenizer): Tokenizer to be\n            changed to\n\n    Returns:\n        encoded_tokens: Encoded tokens\n    \"\"\"\n\n    # decode tokens\n    with torch.no_grad():\n        decoded_tokens = [\n            tokenizer1.decode(token) for i, token in enumerate(tokens)\n        ]\n\n        # remove all the pad tokens\n        decoded_tokens = [\n            token.replace(tokenizer1.pad_token, \"\") for token in decoded_tokens\n        ]\n\n        # remove all the eos tokens\n        decoded_tokens = [\n            token.replace(tokenizer1.eos_token, \"\") for token in decoded_tokens\n        ]\n\n        # encode the actions with critic tokenizer\n        encoded_tokens = tokenizer2(\n            decoded_tokens,\n            return_tensors=\"pt\",\n            padding=True,\n            truncation=True,\n        )\n\n    return encoded_tokens\n\n\nConfigType = Union[ConfigActor, ConfigReward, ConfigCritic]\n\n\n@beartype\ndef 
check_model_family(config1: ConfigType, config2: ConfigType) -> bool:\n    \"\"\"Check if the model family is the same for the two configs\n    the model family is specified in the config.model\n\n    Args:\n        config1 (ConfigType): First config\n        config2 (ConfigType): Second config\n\n    Returns:\n        bool: True if the model family is the same, False otherwise\n    \"\"\"\n\n    # check if both are an hugging face models\n    if (config1.model in hf_models) and (config2.model in hf_models):\n\n        # if there is a \"/\" remove it from the name\n        model_name1 = config1.model\n        model_name2 = config2.model\n        if \"/\" in model_name1:\n            model_name1 = model_name1.split(\"/\")[1]\n        if \"/\" in model_name2:\n            model_name2 = model_name2.split(\"/\")[1]\n\n        # check if the model family is the same\n        return model_name1.split(\"-\")[0] == model_name2.split(\"-\")[0]\n\n    # check if both are not an hugging face models\n    elif (config1.model not in hf_models) and (config2.model not in hf_models):\n\n        # for now they could be only LLaMA models\n        return True\n    else:\n        return False\n\n\nclass ActorCritic(torch.nn.Module):\n    \"\"\"Actor Critic class stores both the actor and the critic models\n    and it generates values and action for given sequences during the training\n    of the actor.\n\n    Attributes:\n        actor (ActorModel): Actor model\n        critic (CriticModel): Critic model\n        debug (bool): enable prints for Debugging\n        use_same_tokenizer (bool): if True the actor and critic use the same\n            tokenizer\n\n    Methods:\n        forward: given a sequence returns action logits and values (used\n            to evaluate the actor during training)\n        generate: given a sequence returns action, action logits, values\n            sequences and sequences masks (used to generate new sequences\n            during acting phase)\n    
\"\"\"\n\n    def __init__(self, config: Config) -> None:\n        super().__init__()\n        self.config = config\n\n        self.actor = ActorModel(config.actor)\n\n        # check if critic must be initialized from reward model\n        ModelLoader.init_critic_from_reward(config.critic)\n        self.critic = CriticModel(config.critic)\n\n        # if the actor and critic use the same tokenizer is set to True\n        self.use_same_tokenizer = False\n\n        # debug flag\n        self.debug = config.actor.debug\n\n    @beartype\n    def load(self) -> None:\n        \"\"\"Load the model from the path.\n        This method is not implemented since it relies on actor and critic\n        __init__ methods to perform the loading from their respective paths\n        then loaded.\n\n        \"\"\"\n        pass\n\n    @beartype\n    def save(self) -> None:\n        \"\"\"Save the model to the path\n        This method is implemented to save the actor model as result of RLHF\n        in the folder actor_rl instead of actor.save() method that saves it\n        in the actor folder.\n        \"\"\"\n        # get the path to save the actor\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config,\n            is_checkpoint=False,\n        )\n\n        # save the model\n        print(f\"Saving model to {path} ...\")\n        torch.save(\n            {\"state_dict\": self.actor.model.state_dict()},\n            path,\n        )\n\n        # get the path to save the critic model\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config.critic,\n            is_checkpoint=False,\n        )\n\n        # save the model\n        print(f\"Saving model to {path} ...\")\n        torch.save(\n            {\n                \"model\": self.critic.model.state_dict(),\n                \"head\": self.critic.head.state_dict(),\n            },\n            path,\n        )\n\n    def save_deepspeed(\n 
       self,\n        model_engine: DeepSpeedEngine,\n        config: ConfigType,\n        client_state: dict = None,\n    ):\n        \"\"\"Save the deepspeed model_engine to the path\n        This method is implemented to save the actor model as result of RLHF\n        in the folder actor_rl instead of actor.save() method that saves it\n        in the actor folder. Same goes for the critic model.\n        \"\"\"\n        # get the path to save the actor\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=config,\n            is_checkpoint=False,\n        )\n\n        # save the model\n        print(f\"Saving model to {path} ...\")\n        model_engine.save_checkpoint(\n            save_dir=path, client_state=client_state if client_state else {}\n        )\n\n    @beartype\n    def forward(\n        self,\n        sequences_actor: torch.Tensor,\n        sequences_mask_actor: torch.Tensor,\n        sequences_critic: torch.Tensor,\n        sequences_mask_critic: torch.Tensor,\n        action_len_actor: int,\n        action_len_critic: int,\n    ) -> Tuple:\n        \"\"\"Given the whole sequences, use the actor forward to get the logits\n            for each token in the sequence and the critic forward to get the\n            values for each generation step.\n\n        Args:\n            sequences_actor (torch.Tensor): Sequences composed of\n                [states, actions] for the actor\n            sequence_mask_actor (torch.Tensor): Mask for the sequences\n                of the actor\n            sequences_critic (torch.Tensor): Sequences composed of\n                [states, actions] for the critic\n            sequences_mask_critic (torch.Tensor): Mask for the sequences\n                of the critic\n            action_len_actor (int): Length of the actions in the sequences\n                for the actor\n            action_len_critic (int): Length of the actions in the sequences\n                for the critic\n\n      
  Returns:\n            action_logits (torch.Tensor): Logits for the actions in the\n                sequences\n            values (torch.Tensor): Values for the actions in the sequences\n        \"\"\"\n\n        # use a single forward on the whole sequence\n        # to get pi(y | x) and ignore predicted output\n        actions_logits = self.actor.forward(\n            sequences_actor, sequences_mask_actor\n        )\n\n        # use the critic forward to get the values for the actions\n        values = self.critic.forward(sequences_critic, sequences_mask_critic)\n\n        # return only logits and values for the actions taken\n        real_actions_logits = actions_logits[:, -action_len_actor:, :]\n        real_values = values[:, -action_len_critic:]\n\n        if self.debug:\n            print(\"ActorCritic.forward\")\n            print(\"action_len_actor\", action_len_actor)\n            print(\"action_len_critic\", action_len_critic)\n            print(\"sequences_actor.shape\", sequences_actor.shape)\n            print(\"sequences_actor\", sequences_actor)\n            print(\"sequences_critic.shape\", sequences_critic.shape)\n            print(\"sequences_critic\", sequences_critic)\n            print(\"real_action_logits.shape\", actions_logits.shape)\n            print(\"real_action_logits\", actions_logits)\n            print(\"real_values.shape\", values.shape)\n            print(\"real_values\", values)\n\n        return (\n            real_actions_logits,\n            real_values,\n        )\n\n    @torch.no_grad()\n    @beartype\n    def generate(\n        self,\n        states_actor: torch.Tensor,\n        states_mask_actor: torch.Tensor,\n        states_critic: torch.Tensor,\n    ) -> Tuple:\n        \"\"\"Generate actions, actions_logits, values and sequences from states\n\n        Args:\n            states_actor (torch.Tensor): States for the actor\n            states_mask_actor (torch.Tensor): Mask for the states for the\n                actor\n  
          states_critic (torch.Tensor): States for the critic\n\n        Returns:\n            actions (torch.Tensor): Actions generated from the states\n            actions_logits (torch.Tensor): Logits for the actions generated\n                from the states (i.e. pi(y | x))\n            values (torch.Tensor): Values generated by the critic model\n                for the actions generated by the actor (i.e. V(x))\n            sequences (torch.Tensor): Sequences generated from the states\n                as [states, actions]\n        \"\"\"\n\n        # generate action sequence from the actor\n        actions, sequences_actor = self.actor.generate(\n            states_actor, states_mask_actor\n        )\n\n        # create mask for the actor sequences\n        sequences_mask_actor = (\n            (sequences_actor != self.actor.tokenizer.pad_token_id)\n            .to(sequences_actor.device)\n            .long()\n            .detach()\n        )\n\n        # get the length of the actions\n        action_len_actor = actions.shape[1]\n\n        # check if different encoding is needed for the critic\n        if self.use_same_tokenizer:\n            sequences_critic = sequences_actor\n            sequences_mask_critic = sequences_mask_actor\n            action_len_critic = action_len_actor\n        else:\n            encoded_critic = change_tokenization(\n                sequences_actor,\n                self.actor.tokenizer,\n                self.critic.tokenizer,\n            )\n            # split the encoded_critic in tokens and maks\n            sequences_critic = encoded_critic[\"input_ids\"].to(\n                sequences_actor.device,\n            )\n            sequences_mask_critic = (\n                encoded_critic[\"attention_mask\"]\n                .to(sequences_actor.device)\n                .long()\n                .detach()\n            )\n\n            # compute len of actions for the critic tokenizer\n            action_len_critic = 
states_critic.shape[1]\n\n        # generate actions_logits and values\n        actions_logits, values = self.forward(\n            sequences_actor,\n            sequences_mask_actor,\n            sequences_critic,\n            sequences_mask_critic,\n            action_len_actor,\n            action_len_critic,\n        )\n        if self.debug:\n            print(\"ActorCritic.generate\")\n            print(\"actions shape\", actions.shape)\n            print(\"actions\", actions)\n            print(\"sequence shape\", sequences_actor.shape)\n            print(\"sequence\", sequences_actor)\n            print(\"actions_logits shape\", actions_logits.shape)\n            print(\"actions_logits\", actions_logits)\n            print(\"values shape\", values.shape)\n            print(\"values\", values)\n\n        return (\n            actions,\n            actions_logits,\n            values,\n            sequences_actor,\n            sequences_mask_actor,\n            sequences_critic,\n            sequences_mask_critic,\n            action_len_actor,\n            action_len_critic,\n        )\n\n\n# structure to store the data for each experience\nMemory = namedtuple(\n    \"Memory\",\n    [\n        \"states_actor\",\n        \"actions\",\n        \"values\",\n        \"rewards\",\n        \"actions_log_probs\",\n        \"sequences_actor\",\n        \"sequences_mask_actor\",\n        \"sequences_critic\",\n        \"sequences_mask_critic\",\n        \"action_len_actor\",\n        \"action_len_critic\",\n    ],\n)\n\n\nclass ExperienceDataset(Dataset):\n    \"\"\"Dataset to train the actor-critic models\"\"\"\n\n    def __init__(\n        self,\n        memories: Deque[Memory],\n        device: torch.device,\n    ) -> None:\n        super().__init__()\n        self.data = list(memories)\n\n    def __len__(\n        self,\n    ) -> int:\n        return len(self.data)\n\n    def __getitem__(self, idx) -> Tuple:\n        # return the idx-th memory element as a tuple 
of tensors on the device\n        item = (\n            self.data[idx].states_actor,\n            self.data[idx].actions,\n            self.data[idx].values,\n            self.data[idx].rewards,\n            self.data[idx].actions_log_probs,\n            self.data[idx].sequences_actor,\n            self.data[idx].sequences_mask_actor,\n            self.data[idx].sequences_critic,\n            self.data[idx].sequences_mask_critic,\n            int(self.data[idx].action_len_actor),\n            int(self.data[idx].action_len_critic),\n        )\n        return item\n\n\nclass ExamplesSampler:\n    \"\"\"Store the prompt to be sampled to generate the examples\n    read a json file with the following format:\n    [\n        {\n            \"user_input\" : \"\",\n        } ,\n        ...\n    ]\n    Where:\n        user_input: is the input of the user or directly the input of the user\n            with the memory prepended (i.e. user_input + memory)\n    \"\"\"\n\n    def __init__(\n        self,\n        path: str,\n    ) -> None:\n        self.path = path\n        with open(path, \"r\") as f:\n            data = json.load(f)\n        self.data = [d[\"user_input\"] for d in data]\n\n    def sample(self, n: int) -> List:\n        \"\"\"Sample n examples from the data\n\n        Args:\n            n (int): Number of examples to sample\n        \"\"\"\n        return random.sample(self.data, n)\n\n\nclass RLTrainer:\n    \"\"\"Train the actor-critic model using RL\n\n    Attributes:\n        config (Config): Configuration of the trainer\n        debug (bool): Debug mode\n        actorcritic (ActorCritic): Actor-critic model\n        actor_optim (torch.optim): Optimizer for the actor\n        critic_optim (torch.optim): Optimizer for the critic\n        actor_scheduler (torch.optim.lr_scheduler): Scheduler for the actor\n        critic_scheduler (torch.optim.lr_scheduler): Scheduler for the critic\n        reward (RewardModel): Reward model\n        training_stats 
(TrainingStats): Class to store training stats\n        conversation_log (ConversationLog): Class to store the conversation\n        examples_sampler (ExamplesSampler): Class to sample examples\n        eps (float): small epsilon to avoid division by zero\n\n    Methods:\n        train: the training loop that calls the learn function after generating\n            the experiences.\n        learn: Learn from a batch of experiences and update the actor and the\n            critic model.\n        load_checkpoint: Load the checkpoint of the actor-critic model\n        save_checkpoint: Save the checkpoint of the actor-critic model\n    \"\"\"\n\n    def __init__(\n        self,\n        config: Config,\n    ) -> None:\n\n        # save config\n        self.config = config\n\n        # set debug mode\n        self.debug = config.trainer.debug\n\n        # initialize agent-critic\n        self.actorcritic = ActorCritic(config)\n\n        # initialize actor optimizer\n        self.actor_optimizer = torch.optim.Adam(\n            self.actorcritic.actor.parameters(), lr=config.trainer.actor_lr\n        )\n\n        # initialize critic optimizer\n        self.critic_optimizer = torch.optim.Adam(\n            self.actorcritic.critic.parameters(), lr=config.trainer.critic_lr\n        )\n\n        # scheduler (defined in the learn() method (i need dataset size))\n        self.actor_scheduler = None\n        self.critic_scheduler = None\n\n        # initialize reward model\n        self.reward = RewardModel(config.reward)\n\n        # initialize class to store training stats\n        path = ModelLoader.get_training_stats_path(config)\n        self.training_stats = TrainingStats(path)\n        model_folder, _, _ = ModelLoader.get_model_path(\n            config,\n            is_checkpoint=True,\n        )\n        path = os.path.join(model_folder, \"conversations_log.json\")\n        self.conversation_log = ConversationLog(path)\n\n        # initialize examples sampler\n        
self.example_sampler = ExamplesSampler(config.trainer.examples_path)\n\n        # check if actor and critic use the same tokenizer\n        self.actorcritic.use_same_tokenizer = check_model_family(\n            config.actor, config.critic\n        )\n\n        # check if actor and reward use the same tokenizer\n        self.use_same_tokenizer = check_model_family(\n            config.actor, config.reward\n        )\n\n        # eps\n        self.eps = 1e-8\n\n        # deepspeed initialization\n        self.actor_model_engine = None\n        self.critic_model_engine = None\n        self.is_deepspeed_init = None\n\n        if (\n            self.config.actor.deepspeed_enable\n            or self.config.critic.deepspeed_enable\n            or self.config.critic.deepspeed_enable\n        ):\n            deepspeed.init_distributed(\"nccl\")\n            self.is_deepspeed_init = True\n            os.environ[\"TOKENIZERS_PARALLELISM\"] = \"False\"\n\n        else:\n            self.is_deepspeed_init = False\n\n        if self.config.actor.deepspeed_enable:\n            (\n                self.actor_model_engine,\n                self.actorcritic.actor,\n                self.actor_optimizer,\n            ) = self.initialize_deepspeed_model(\n                config=self.config.actor, model=self.actorcritic.actor\n            )\n\n        if self.config.critic.deepspeed_enable:\n            (\n                self.critic_model_engine,\n                self.actorcritic.critic,\n                self.critic_optimizer,\n            ) = self.initialize_deepspeed_model(\n                config=self.config.critic, model=self.actorcritic.critic\n            )\n\n        if self.config.reward.deepspeed_enable:\n            (\n                _,\n                self.reward,\n                _,\n            ) = self.initialize_deepspeed_model(\n                config=self.config.reward, model=self.reward\n            )\n\n    @staticmethod\n    def initialize_deepspeed_model(\n       
     config: Union[ConfigActor, ConfigCritic, ConfigReward],\n            model: torch.nn.Module,\n    ):\n\n        if config.deepspeed_config_path is None:\n            raise ValueError(\"DeepSpeed config path is None, but deepspeed is enabled\")\n        if os.path.exists(config.deepspeed_config_path) is False:\n            raise ValueError(\n                f\"DeepSpeed config path\"\n                f\"{config.deepspeed_config_path}\"\n                f\"does not exist\"\n            )\n        (model_engine, ds_optimizer, _, _,) = deepspeed.initialize(\n            args=None,\n            model=model,\n            model_parameters=model.parameters(),\n            config=config.deepspeed_config_path,\n        )\n        # model_engine.module has to be returned to make custom methods\n        # of Module accessible\n        return model_engine, model_engine.module, ds_optimizer\n\n    @beartype\n    def save_checkpoint(\n        self,\n        current_episode: int,\n        max_episode: int,\n    ) -> None:\n\n        print(f\"Saving checkpoint for episode {current_episode+1}..\")\n\n        # get the path to save the checkpoint for the critic\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config.critic,\n            is_checkpoint=True,\n            current_epoch=current_episode,\n            max_epochs=max_episode,\n            max_steps=0,\n        )\n\n        # if the checkpoint already exists remove it.\n        # Deepspeed checkpoints are already directories and will be overwritten\n        if os.path.exists(path) and not self.is_deepspeed_init:\n            os.remove(path)\n\n        # save the checkpoint\n        actor_checkpoint_dict = {\n            \"episode\": current_episode,\n            \"critic_state_dict\": self.actorcritic.critic.state_dict(),\n            \"critic_optim_state_dict\": self.critic_optimizer.state_dict(),\n        }\n\n        if self.config.actor.deepspeed_enable:\n            # 
The model and optimizer state dicts are actually already saved\n            # In the deepspeed model engine. But to make sure no depending\n            # methods fail, the states are included in actor_checkpoint_dict.\n            # ATTENTION: If you use deepspeed zero optimization, the client_state\n            # will not be saved\n            self.actor_model_engine.save_checkpoint(\n                save_dir=path, client_state=actor_checkpoint_dict\n            )\n        else:\n            torch.save(actor_checkpoint_dict, path)\n\n        # get the path to save the checkpoint for the actor\n        model_folder, model_name, path = ModelLoader.get_model_path(\n            config=self.config,\n            is_checkpoint=True,\n            current_epoch=current_episode,\n            max_epochs=max_episode,\n            max_steps=0,\n        )\n\n        # if the checkpoint already exists remove it.\n        # Deepspeed checkpoints are already directories and will be overwritten\n        if os.path.exists(path) and not self.is_deepspeed_init:\n            os.remove(path)\n\n        # save the checkpoint\n        critic_checkpoint_dict = {\n            \"episode\": current_episode,\n            \"actor_state_dict\": self.actorcritic.actor.state_dict(),\n            \"actor_optim_state_dict\": self.actor_optimizer.state_dict(),\n            \"training_stats\": self.training_stats,\n        }\n\n        if self.config.critic.deepspeed_enable:\n            # The model and optimizer state dicts are actually already saved\n            # In the deepspeed model engine. 
But to make sure no depending\n            # methods fail, the states are included in critic_checkpoint_dict.\n            # ATTENTION: If you use deepspeed zero optimization, the client_state\n            # will not be saved\n            self.critic_model_engine.save_checkpoint(\n                save_dir=path, client_state=critic_checkpoint_dict\n            )\n        else:\n            torch.save(critic_checkpoint_dict, path)\n\n    @beartype\n    def load_checkpoint(\n        self,\n    ) -> int:\n\n        critic_episode = -1\n        actor_episode = -1\n\n        # check if there are some checkpoint for the critic\n        print(\"Looking for checkpoints...\")\n        path = ModelLoader.check_model_path(\n            config=self.config.critic,\n            is_checkpoint=True,\n            current_epoch=None,\n        )\n\n        # if there are checkpoint\n        if path is not None:\n\n            # load the critic checkpoint\n            print(\"Loading ...\")\n            try:\n                checkpoint = torch.load(path)\n            except Exception:\n                print(\n                    \"Checkpoint of critic corrupted!\"\n                    \"Try to remove the last checkpoint.\"\n                    \"Now Starting from episode 0\"\n                )\n                return 0\n\n            # load checkpoint into model\n            critic_episode = checkpoint[\"episode\"]\n            self.actorcritic.critic.load_state_dict(\n                checkpoint[\"critic_state_dict\"]\n            )\n            self.critic_optimizer.load_state_dict(\n                checkpoint[\"critic_optim_state_dict\"]\n            )\n\n        # check if there are checkpoints for the actor\n        print(\"Looking for checkpoints...\")\n        path = ModelLoader.check_model_path(\n            config=self.config,\n            is_checkpoint=True,\n            current_epoch=None,\n        )\n\n        # if there are some checkpoints\n        if path is not None:\n\n 
           # load the actor checkpoint\n            print(\"Loading ...\")\n            try:\n                checkpoint = torch.load(path)\n            except Exception:\n                print(\n                    \"Checkpoint of actor corrupted!\"\n                    \"Try to remove the last checkpoint.\"\n                    \"Now Starting from episode 0\"\n                )\n                return 0\n\n            # load checkpoint into the model\n            actor_episode = checkpoint[\"episode\"]\n            self.actorcritic.actor.load_state_dict(\n                checkpoint[\"actor_state_dict\"]\n            )\n            self.actor_optimizer.load_state_dict(\n                checkpoint[\"actor_optim_state_dict\"]\n            )\n            self.training_stats = checkpoint[\"training_stats\"]\n\n        # check if there are some discrepancies between the checkpoints\n        if critic_episode == actor_episode:\n            # all ok start from next episode\n            return critic_episode + 1\n        else:\n            print(\n                f\"There are some discrepancies between the checkpoints\"\n                f\"of actor and critic \\nactor episode: {actor_episode}\"\n                f\"\\n critic episode: {critic_episode}\\n\"\n            )\n            return min(critic_episode, actor_episode) + 1\n\n    @beartype\n    def learn(self, memories: Deque[Memory]) -> None:\n        \"\"\"Train the agent-critic model using RL:\n        - for each batch of episodes, compute action logits and values\n        - then compare action logits probs with memories one and values with\n            rewards to compute the PPO loss and update the actor-critic model\n        \"\"\"\n        print(\"Start to Learn...\")\n\n        # get parameters\n        epochs = self.config.trainer.epochs\n        actor_eps_clip = self.config.trainer.actor_eps_clip\n        critic_eps_clip = self.config.trainer.critic_eps_clip\n        beta_s = self.config.trainer.beta_s\n     
   batch_size = self.config.trainer.batch_size\n        device = (\n            torch.device(f\"cuda:{dist.get_rank()}\")\n            if self.is_deepspeed_init\n            else self.config.trainer.device\n        )\n\n        # create dataset from memories\n        dataset = ExperienceDataset(memories, device)\n        if self.is_deepspeed_init:\n            engine = self.actor_model_engine or self.critic_model_engine\n            dataloader = engine.deepspeed_io(dataset)\n        else:\n            dataloader = DataLoader(dataset, batch_size=batch_size)\n\n        # initialize scheduler for actor\n        actor_lr = self.config.trainer.actor_lr\n        # This lr_scheduler is not available in deepspeed\n        # see https://deepspeed.readthedocs.io/en/latest/schedulers.html\n        if not self.is_deepspeed_init:\n            self.actor_scheduler = CosineAnnealingWarmRestarts(\n                self.actor_optimizer, T_0=len(dataset), eta_min=actor_lr * 0.1\n            )\n\n        # initialize scheduler for critic\n        critic_lr = self.config.trainer.critic_lr\n        # This lr_scheduler is not available in deepspeed\n        # see https://deepspeed.readthedocs.io/en/latest/schedulers.html\n        if not self.is_deepspeed_init:\n            self.critic_scheduler = CosineAnnealingWarmRestarts(\n                self.critic_optimizer, T_0=len(dataset), eta_min=critic_lr * 0.1\n            )\n\n        # initialize actor accelerate\n        if self.config.actor.accelerate_enable is True:\n            actor_accelerator = Accelerator()\n            (\n                actor_model,\n                self.actor_optimizer,\n                self.train_dataloader,\n                self.actor_scheduler,\n            ) = actor_accelerator.prepare(\n                self.actorcritic.actor,\n                self.actor_optimizer,\n                self.train_dataloader,\n                self.actor_scheduler,\n            )\n            self.actorcritic.actor = 
actor_model\n\n        # initialize critic accelerate\n        if self.config.critic.accelerate_enable is True:\n            critic_accelerator = Accelerator()\n            (\n                critic_model,\n                self.critic_optimizer,\n                self.critic_scheduler,\n            ) = critic_accelerator.prepare(\n                self.actorcritic.critic,\n                self.critic_optimizer,\n                self.critic_scheduler,\n            )\n            self.actorcritic.critic = critic_model\n\n        # train agent-critic\n        self.actorcritic.train()\n        for epoch in range(epochs):\n            for k, batch in enumerate(dataloader):\n\n                (\n                    states_actor,\n                    old_actions,\n                    old_values,\n                    rewards,\n                    old_actions_log_probs,\n                    sequences_actor,\n                    sequences_mask_actor,\n                    sequences_critic,\n                    sequences_mask_critic,\n                    action_len_actor,\n                    action_len_critic,\n                ) = [tensor.to(device) for tensor in batch]\n\n                if self.debug:\n                    print(\n                        f\"#########################################\"\n                        f\" batch from memories {k} \\n \"\n                        f\"#########################################\"\n                        f\"states_actor {states_actor.shape} \\n\"\n                        f\"old_actions {old_actions.shape} \\n\"\n                        f\"old_values {old_values.shape} \\n\"\n                        f\"rewards {rewards.shape} \\n\"\n                        f\"old_actions_log_probs \"\n                        f\"{old_actions_log_probs.shape}\\n\"\n                        f\"sequences_actor {sequences_actor.shape} \\n\"\n                        f\"sequences_mask_actor \"\n                        f\"{sequences_mask_actor.shape} 
\\n\"\n                        f\"sequences_critic {sequences_critic.shape} \\n\"\n                        f\"sequences_mask_critic \"\n                        f\"{sequences_mask_critic.shape} \\n\"\n                        f\"action_len_actor {action_len_actor} \\n\"\n                        f\"action_len_critic {action_len_critic} \\n\"\n                        f\"#########################################\"\n                    )\n\n                # get actor critic new probabilities and values\n                actions_logits, values = self.actorcritic.forward(\n                    sequences_actor,\n                    sequences_mask_actor,\n                    sequences_critic,\n                    sequences_mask_critic,\n                    action_len_actor.item(),\n                    action_len_critic.item(),\n                )\n\n                # get action log prob\n                actions_prob = (\n                    torch.softmax(actions_logits, dim=-1).max(dim=-1).values\n                )\n                actions_log_prob = torch.log(actions_prob + self.eps)\n\n                # compute entropy\n                entropies = (actions_prob * actions_log_prob).sum(dim=-1)\n\n                # compute KL divergence\n                kl_div_loss = (\n                    (actions_prob * (old_actions_log_probs - actions_log_prob))\n                    .sum(dim=-1)\n                    .mean()\n                )\n\n                # compute ratios\n                ratios = (actions_log_prob - old_actions_log_probs).exp()\n\n                # compute PPO loss\n                if check_model_family(self.config.actor, self.config.critic):\n                    # compute discounted rewards as in TRL\n                    gamma = self.config.trainer.gamma_discounted\n                    discounted_rewards = torch.zeros_like(old_values)\n                    for i in range(discounted_rewards.shape[1]):\n                        for j in range(i, 
discounted_rewards.shape[1]):\n                            discounted_rewards[:, i] += (\n                                gamma ** (j - i) * rewards[:, j]\n                            )\n\n                    advantages = (\n                        discounted_rewards - old_values\n                    )  # TRL has opposite sign for old values\n                    advantages = (advantages - advantages.mean(dim=-1)) / (\n                        advantages.std() + self.eps\n                    )\n\n                    surr1 = advantages * ratios\n                else:\n                    advantages = rewards - old_values[:, -1]\n                    surr1 = advantages * ratios\n\n                surr2 = (\n                    torch.clamp(ratios, 1 - actor_eps_clip, 1 + actor_eps_clip)\n                    * advantages\n                )\n\n                policy_loss = -torch.min(surr1, surr2) - beta_s * entropies\n                policy_loss = policy_loss.mean()\n                loss = policy_loss + kl_div_loss\n\n                # check if loss item is NaN\n                if torch.isnan(loss):\n                    raise ValueError(\"Loss is nan\")\n\n                # update actor with loss\n                if self.config.actor.deepspeed_enable:\n                    self.actor_model_engine.backward(loss)\n                    self.actor_model_engine.step()\n                elif self.config.actor.accelerate_enable:\n                    self.actor_optimizer.zero_grad()\n                    actor_accelerator.backward(loss)\n                    self.actor_optimizer.step()\n                    self.actor_scheduler.step()\n                else:\n                    self.actor_optimizer.zero_grad()\n                    loss.backward()\n                    self.actor_optimizer.step()\n                    self.actor_scheduler.step()\n\n                # compute value loss\n                # the loss is the distance between the rewards and the values\n                # I want 
this distance to be small so that values are\n                # representative of the rewards, for this reason i took the\n                # maximum between the two.\n                # The clip is limiting the slew-rate of values_loss_clipped\n                value_loss_clipped = old_values + (values - old_values).clamp(\n                    -critic_eps_clip, critic_eps_clip\n                )\n                value_loss1 = (value_loss_clipped - rewards) ** 2\n                value_loss2 = (values - rewards) ** 2\n                value_loss = torch.max(value_loss1, value_loss2).mean()\n\n                if torch.isnan(value_loss):\n                    raise ValueError(\"Value loss is nan\")\n\n                # update critic\n                if self.config.critic.deepspeed_enable:\n                    self.critic_model_engine.backward(value_loss)\n                    self.critic_model_engine.step()\n                elif self.config.critic.accelerate_enable:\n                    self.critic_optimizer.zero_grad()\n                    critic_accelerator.backward(loss)\n                    self.critic_optimizer.step()\n                    self.critic_scheduler.step()\n                else:\n                    self.critic_optimizer.zero_grad()\n                    value_loss.backward()\n                    self.critic_optimizer.step()\n                    self.critic_scheduler.step()\n\n                # append the losses to the training stats\n                self.training_stats.training_loss.append(\n                    loss.detach().cpu().item()\n                )\n                self.training_stats.value_loss.append(\n                    value_loss.detach().cpu().item()\n                )\n\n                # print iteration info\n                print(\n                    f\"Epoch {epoch+1}/{epochs}\",\n                    f\"Step {k+1}/{int(len(dataloader) / batch_size)}\",\n                    f\"Loss {loss.detach().cpu().item():.4f}\",\n                    
f\"Value Loss {value_loss.detach().cpu().item():.4f}\",\n                )\n\n        self.actorcritic.eval()\n        print(\"End Learning\")\n\n    def train(\n        self,\n    ) -> None:\n\n        print(\"Start RL Training\")\n\n        # initialize settings\n        num_episodes = self.config.trainer.num_episodes\n        max_timesteps = self.config.trainer.max_timesteps\n        num_examples = self.config.trainer.num_examples\n        update_timesteps = self.config.trainer.update_timesteps\n        batch_size = self.config.trainer.batch_size\n        checkpoint_steps = self.config.trainer.checkpoint_steps\n        device = (\n            torch.device(f\"cuda:{dist.get_rank()}\")\n            if self.is_deepspeed_init\n            else self.config.trainer.device\n        )\n\n        # number of elements that the memories should contain when learning\n        number_of_memories_per_learn_iteration = (\n            num_examples * update_timesteps\n        )\n\n        # the number of memories must be a multiple of the batch size\n        assert (\n            number_of_memories_per_learn_iteration % batch_size == 0\n        ), \"The number of memories must be a multiple of the batch size\"\n\n        # the total number of timesteps done in the train() are\n        total_number_of_timesteps = num_episodes * max_timesteps\n\n        # the total timesteps done should be a multiple of the update timesteps\n        assert total_number_of_timesteps % update_timesteps == 0, (\n            \"The number of timesteps (num_episodes*max_timesteps)\"\n            \"must be a multiple of the update_timesteps\"\n        )\n\n        # initialize memories\n        memories = deque([])\n\n        # load checkpoint\n        start_episode = self.load_checkpoint()\n\n        # if it is a new training from the start clear the conversation log\n        if start_episode == 0:\n            self.conversation_log.clear()\n\n        # initialize counters\n        cnt_timesteps = 0\n    
    cnt_learn_iter = 0\n\n        # loop over episodes and timesteps\n        self.actorcritic.eval()\n        for episode in range(start_episode, num_episodes):\n            for timestep in range(max_timesteps):\n\n                # print the iteration info\n                print(\n                    f\"Episode: {episode + 1}/{num_episodes}, \"\n                    f\"Timestep: {timestep + 1}/{max_timesteps}\",\n                    f\"Learning Cnt: {cnt_timesteps + 1}/{update_timesteps}\",\n                )\n\n                # counter used to count timesteps into memory\n                cnt_timesteps += 1\n\n                # sample num_examples examples from  example dataset\n                inputs = self.example_sampler.sample(num_examples)\n\n                # tokenize examples for the actor\n                tok_inputs_act = self.actorcritic.actor.tokenizer(\n                    inputs, padding=True, return_tensors=\"pt\", truncation=True\n                )\n\n                # states are [batch_size, seq_len_of_states]\n                states_actor = tok_inputs_act[\"input_ids\"].to(device)\n                states_mask_actor = tok_inputs_act[\"attention_mask\"].to(device)\n\n                # tokenize examples for the critic\n                tok_inputs_crt = self.actorcritic.critic.tokenizer(\n                    inputs, padding=True, return_tensors=\"pt\", truncation=True\n                )\n\n                # states are [batch_size, seq_len_of_states]\n                states_critic = tok_inputs_crt[\"input_ids\"].to(device)\n\n                # generate sequences of actions and values\n                (\n                    actions,\n                    actions_logits,\n                    values,\n                    sequences_actor,\n                    sequences_mask_actor,\n                    sequences_critic,\n                    sequences_mask_critic,\n                    action_len_actor,\n                    action_len_critic,\n                ) 
= self.actorcritic.generate(\n                    states_actor, states_mask_actor, states_critic\n                )\n\n                # compute action log probs\n                action_prob = (\n                    torch.softmax(actions_logits, dim=-1).max(dim=-1).values\n                )\n                actions_log_probs = torch.log(action_prob + self.eps)\n\n                # get tokenized sequence for the reward models\n                if self.use_same_tokenizer:\n                    reward_sequence = sequences_actor\n                    reward_mask = sequences_mask_actor\n                elif check_model_family(\n                    self.config.critic, self.config.reward\n                ):\n                    reward_sequence = sequences_critic\n                    reward_mask = sequences_mask_critic\n                else:\n                    tokenized_responses = change_tokenization(\n                        sequences_actor,\n                        self.actorcritic.actor.tokenizer,\n                        self.reward.tokenizer,\n                    )\n                    # get tokens and mask\n                    reward_sequence = tokenized_responses[\"input_ids\"].to(\n                        device\n                    )\n                    reward_mask = tokenized_responses[\"attention_mask\"].to(\n                        device\n                    )\n\n                # compute rewards\n                rewards = self.reward.forward(\n                    reward_sequence,\n                    reward_mask,\n                )\n\n                rewards = rewards[:, -action_len_critic:]\n                reward = rewards[:, -1]\n\n                # store memories of the episode / timestep\n                for i in range(states_actor.shape[0]):\n                    memories.append(\n                        Memory(\n                            states_actor[i, :].detach().cpu(),\n                            actions[i, :].detach().cpu(),\n                    
        values[i, :].detach().cpu(),\n                            rewards[i, :].detach().cpu(),\n                            actions_log_probs[i, :].detach().cpu(),\n                            sequences_actor[i, :].detach().cpu(),\n                            sequences_mask_actor[i, :].detach().cpu(),\n                            sequences_critic[i, :].detach().cpu(),\n                            sequences_mask_critic[i, :].detach().cpu(),\n                            int(action_len_actor),\n                            int(action_len_critic),\n                        )\n                    )\n\n                # decode completions to be logged in the conversation log\n                completions = [\n                    self.actorcritic.actor.tokenizer.decode(action)\n                    for action in actions\n                ]\n                # remove pad tokens from completions\n                completions = [\n                    c.replace(self.actorcritic.actor.tokenizer.pad_token, \"\")\n                    for c in completions\n                ]\n                # remove eos tokens from completions\n                completions = [\n                    c.replace(self.actorcritic.actor.tokenizer.eos_token, \"\")\n                    for c in completions\n                ]\n                # strange i need to force this?\n                completions = [c.replace(\"<pad>\", \"\") for c in completions]\n\n                # log the memories in the conversation log\n                for i in range(states_actor.shape[0]):\n                    self.conversation_log.append(\n                        inputs[i],\n                        completions[i],\n                        reward[i].detach().cpu().item(),\n                        cnt_learn_iter,\n                    )\n\n                # learn from memories\n                if (cnt_timesteps % update_timesteps == 0) and (\n                    cnt_timesteps != 0\n                ):\n                    print(\"len 
memories\", len(memories))\n                    if not self.is_deepspeed_init or (dist.get_rank() == 0):\n                        self.conversation_log.save()\n                    self.learn(memories)\n                    mean_reward = sum([m.rewards[-1] for m in memories]) / len(\n                        memories\n                    )\n                    print(f\"Mean Reward: {mean_reward}\")\n                    memories.clear()\n                    cnt_timesteps = 0\n                    cnt_learn_iter += 1\n                    if not self.is_deepspeed_init or (dist.get_rank() == 0):\n                        self.conversation_log.save()\n\n            # save checkpoints\n            if (episode % checkpoint_steps == 0) and (episode != 0):\n                self.save_checkpoint(\n                    current_episode=episode, max_episode=num_episodes\n                )\n                if not self.is_deepspeed_init or (dist.get_rank() == 0):\n                    self.conversation_log.save()\n\n        # save the models\n        if self.is_deepspeed_init:\n            self.actorcritic.save_deepspeed(self.actor_model_engine, self.config)\n            self.actorcritic.save_deepspeed(\n                self.critic_model_engine, self.config.critic\n            )\n        else:\n            self.actorcritic.save()\n        print(\"End RL Training\")\n"
  },
  {
    "path": "optimization/chatllama/chatllama/rlhf/utils.py",
    "content": "import json\nimport os\nfrom beartype import beartype\nfrom plotly import graph_objects as go\n\n\nclass TrainingStats:\n    \"\"\"Training statistics\n\n    Attributes:\n        training_loss (List): List of training losses\n        training_accuracy (List): List of training accuracies\n        value_loss (List): List of value losses\n        validation_loss (List): List of validation losses\n        validation_accuracy (List): List of validation accuracies\n    \"\"\"\n\n    def __init__(self, path: str):\n        \"\"\"Initialize the training stats\n\n        Args:\n            path (str): Path to save the stats\n        \"\"\"\n        self.training_loss = []\n        self.training_accuracy = []\n        self.value_loss = []\n        self.validation_loss = []\n        self.validation_accuracy = []\n        self.path = path\n\n    def plot(self):\n        \"\"\"Plot the training statistics using plotly\"\"\"\n        fig = go.Figure()\n        if len(self.training_loss) > 0:\n            fig.add_trace(\n                go.Scatter(y=self.training_loss, name=\"Training loss\")\n            )\n        if len(self.training_accuracy) > 0:\n            fig.add_trace(\n                go.Scatter(y=self.training_accuracy, name=\"Training accuracy\")\n            )\n        if len(self.value_loss) > 0:\n            fig.add_trace(go.Scatter(y=self.value_loss, name=\"Value loss\"))\n        if len(self.validation_loss) > 0:\n            fig.add_trace(\n                go.Scatter(y=self.validation_loss, name=\"Validation loss\")\n            )\n        if len(self.validation_accuracy) > 0:\n            fig.add_trace(\n                go.Scatter(\n                    y=self.validation_accuracy, name=\"Validation accuracy\"\n                )\n            )\n        fig.update_layout(\n            showlegend=True, xaxis_type=\"log\", xaxis_title=\"steps\"\n        )\n        fig.show()\n\n    def save(\n        self,\n    ):\n        \"\"\"Save the 
stats\"\"\"\n        if os.path.exists(self.path):\n            with open(self.path, \"r\") as f:\n                stats = json.load(f)\n            stats[\"training_loss\"].extend(self.training_loss)\n            stats[\"training_accuracy\"].extend(self.training_accuracy)\n            stats[\"value_loss\"].extend(self.value_loss)\n            stats[\"validation_loss\"].extend(self.validation_loss)\n            stats[\"validation_accuracy\"].extend(self.validation_accuracy)\n        else:\n            stats = {\n                \"training_loss\": self.training_loss,\n                \"training_accuracy\": self.training_accuracy,\n                \"value_loss\": self.value_loss,\n                \"validation_loss\": self.validation_loss,\n                \"validation_accuracy\": self.validation_accuracy,\n            }\n        with open(self.path, \"w\") as f:\n            json.dump(stats, f, indent=4)\n\n    def load(\n        self,\n    ):\n        \"\"\"Load the stats\"\"\"\n        with open(self.path, \"r\") as f:\n            stats = json.load(f)\n        self.training_loss = stats[\"training_loss\"]\n        self.training_accuracy = stats[\"training_accuracy\"]\n        self.value_loss = stats[\"value_loss\"]\n        self.validation_loss = stats[\"validation_loss\"]\n        self.validation_accuracy = stats[\"validation_accuracy\"]\n\n    def clear(\n        self,\n    ):\n        \"\"\"Clear the stats\"\"\"\n        self.training_loss = []\n        self.training_accuracy = []\n        self.value_loss = []\n        self.validation_loss = []\n        self.validation_accuracy = []\n        if os.path.exists(self.path):\n            os.remove(self.path)\n\n\nclass ConversationLog:\n    \"\"\"Save the conversation:\n    (user input, model output, rewards and learn_counter)\n    during the RL training loop.\n    \"\"\"\n\n    def __init__(self, path: str):\n        self.conversation = []\n        self.path = path\n        if self.path is None:\n            
self.path = \"./convesation_log.json\"\n\n    @beartype\n    def append(\n        self,\n        user_input: str,\n        model_output: str,\n        reward: float,\n        learn_counter: int,\n    ):\n        \"\"\"Add a conversation to the log\n\n        Args:\n            user_input (str): User input / initial prompt\n            model_output (str): Completion of the LLM model\n            reward (float): Reward of the reward model assigned to the output\n            learn_counter (int): Number of the learning iteration to\n                distinguish the conversations that happens at different\n                points of the training loopt\n        \"\"\"\n        self.conversation.append(\n            {\n                \"user_input\": user_input,\n                \"model_output\": model_output,\n                \"reward\": reward,\n                \"learn_counter\": learn_counter,\n            }\n        )\n\n    def save(self):\n        print(\"Saving conversations log\")\n        if os.path.exists(self.path):\n            with open(self.path, \"r\") as f:\n                conversation = json.load(f)\n            self.conversation.extend(conversation)\n        self.conversation = sorted(\n            self.conversation, key=lambda x: float(x[\"learn_counter\"])\n        )\n        with open(self.path, \"w\") as f:\n            json.dump(self.conversation, f, indent=4)\n\n    def load(self):\n        with open(self.path, \"r\") as f:\n            self.conversation = json.load(f)\n\n    def clear(self):\n        print(\"Clearing conversations log\")\n        self.conversation = []\n        # remove the file in path exists\n        if os.path.exists(self.path):\n            os.remove(self.path)\n\n    def show(self, current_iteration: int = None):\n        \"\"\"Show the conversation log\n\n        Args:\n            current_iteration (int): Current iteration of the training loop,\n                if not None, print only the conversations that happened at\n     
           <current_iteration>\n        \"\"\"\n        for i, c in enumerate(self.conversation):\n            if current_iteration is None:\n                print(\n                    f\"##########################################\\n\"\n                    f\"Conversation {i} at learn_counter \"\n                    f\"{c['learn_counter']}\\n\"\n                    f\"##########################################\\n\"\n                    f\"## User Input:\\n\\n{c['user_input']}\\n\\n\"\n                    f\"## Model Output:\\n\\n{c['model_output']}\\n\\n\"\n                    f\"## Reward: {c['reward']}\\n\\n\"\n                )\n            else:\n                if current_iteration == c[\"learn_counter\"]:\n                    print(\n                        f\"##########################################\\n\"\n                        f\"Conversation {i} at learn_counter \"\n                        f\"{c['learn_counter']}\\n\"\n                        f\"##########################################\\n\"\n                        f\"## User Input:\\n\\n{c['user_input']}\\n\\n\"\n                        f\"## Model Output:\\n\\n{c['model_output']}\\n\\n\"\n                        f\"## Reward: {c['reward']}\\n\\n\"\n                    )\n"
  },
  {
    "path": "optimization/chatllama/setup.py",
    "content": "from pathlib import Path\nfrom setuptools import setup, find_packages\n\n\nREQUIREMENTS = [\n    \"accelerate\",\n    \"beartype\",\n    \"deepspeed\",\n    \"einops\",\n    \"fairscale\",\n    \"langchain>=0.0.103\",\n    \"torch\",\n    \"tqdm\",\n    \"transformers\",\n    \"datasets\",\n    \"openai\",\n    \"plotly\",\n    \"peft\"\n]\n\nthis_directory = Path(__file__).parent\nlong_description = (this_directory / \"README.md\").read_text(encoding=\"utf8\")\n\nsetup(\n    name=\"chatllama-py\",\n    version=\"0.0.4\",\n    packages=find_packages(),\n    install_requires=REQUIREMENTS,\n    long_description=long_description,\n    include_package_data=True,\n    long_description_content_type=\"text/markdown\",\n)\n"
  },
  {
    "path": "optimization/cloud_surfer/README.md",
    "content": "# 🏄 CloudSurfer (WIP)\nAutomatically discover the optimal cloud configuration and hardware on AWS, GCP and Azure to run your AI models.\n\nIf you like this module, give us a star to show your support for the project ⭐\n\n## 📚 Description\nThe CloudSurfer module allows users to automatically compare the inference performance of their deep learning model across hardware and cloud providers. It leverages state-of-the-art optimization techniques to custom-accelerate the models on each platform, providing the user with an accurate benchmark of their model's performance in terms of speed, accuracy, and cost.\n\nWith CloudSurfer, users can input their model in their preferred deep learning framework and express their preferences for accuracy and performance. The library will then automatically test the model on a range of hardware and cloud platforms, using optimization techniques to ensure that the results are accurate and representative of the model's performance.\n\nUsers can then compare the results side-by-side, seeing the performance of their model on different hardware and cloud providers. This is key to making informed decisions about which platform (cloud and hardware type) to pick, without having to guess or rely on outdated information.\n\nOverall, CloudSurfer provides a powerful and easy-to-use tool to optimize deep learning models and to choose the best inference hardware and cloud platform. Try it out today, and reach out if you have any feedback!\n"
  },
  {
    "path": "optimization/forward_forward/README.md",
    "content": "# Forward-Forward Algorithm\n\nThis module implements a complete open-source version of [Geoffrey Hinton's Forward Forward](https://www.cs.toronto.edu/~hinton/FFA13.pdf) Algorithm, an alternative approach to backpropagation.\n\nThe Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes, one with positive (i.e., real) data and the other with negative data that could be generated by the network itself.\n\nUnlike the backpropagation approach, Forward-Forward does not require calculating the gradient of the loss function with respect to the network parameters. Instead, each optimization step can be performed locally and the weights of each layer can be updated immediately after the layer has performed its forward pass.\n\nIf you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers)\n\n<img width=\"1012\" alt=\"Screenshot 2022-12-20 at 14 45 22\" src=\"https://user-images.githubusercontent.com/83510798/208681462-2d8fc8f8-b24e-41a3-978a-72101f7f6392.png\">\n\n## Installation\n\nThe forward-forward module is built on top of nebullvm, a framework for efficiency-based modules. The library can be easily installed from source code. First you have to clone the repository and navigate to the module directory:\n\n```bash\ngit clone https://github.com/nebuly-ai/nebullvm.git\ncd nebullvm/optimization/forward_forward\n```\n\nThen install the module:\n\n```bash\npip install .\n```\nThis process will just install the minimum requirements for running the module. If you want to run the module on a GPU you have to install the CUDA version of PyTorch. You can find the instructions on the official PyTorch website.\n\n## Usage\nAt the current stage, this implementation supports the main architectures discussed by Hinton in his paper. Each architecture can be trained with the following command:\n\n```python\nfrom forward_forward import train_with_forward_forward_algorithm\nimport os\nimport torch\n\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\ntrained_model = train_with_forward_forward_algorithm(\n    model_type=\"progressive\",\n    n_layers=3,\n    hidden_size=2000,\n    lr=0.03,\n    device=device,\n    epochs=100,\n    batch_size=5000,\n    theta=2.,\n)\n```\n\nThree architectures are currently supported:\n* `progressive`: the simplest architecture described in the paper. It has a pipeline-like structure and each layer can be trained independently from the following ones. Our implementation differs from the original one in that the labels are injected into the image by concatenating them to the flattened tensor instead of replacing the first n_classes pixel values with a one-hot representation of the label.\n\n* `recurrent`: the recurrent architecture described in the paper. It has a recurrent-like structure and it is based on the `GLOM` architecture proposed by Hinton. \n\n* `nlp`: A simple network which can be used as a language model.\n\nThe recurrent and nlp network architectures are better explained below.\n\n## Recurrent Architecture\nThe recurrent architecture is based on the `GLOM` architecture for videos, proposed by Hinton in the paper [How to represent part-whole hierarchies in a neural network](https://arxiv.org/pdf/2102.12627.pdf). Its application to the forward-forward algorithm aims at enabling each layer to learn not just from the previous layer output, but from the following layers as well. This is done by concatenating the outputs of the previous layer and following layers computed at the previous time-step. A learned representation of the label (positive or negative) is given as input to the last layer. The following figure shows the structure of the network:\n\n<p align=\"center\">\n    <img width=\"500\" alt=\"recurrent_net\" src=\"https://user-images.githubusercontent.com/38586138/208651417-498c4bd4-f2dc-4613-a376-0b69317c73d4.png\">\n</p>\n\n## NLP Architecture\nThe forward-forward architecture developed for NLP is a simple network which can be used as a language model. The network is composed of a few normalized fully connected layers followed by a ReLU activation. All hidden representations are then concatenated together and given as input to the softmax for predicting the next token. The network can be trained in a progressive way, i.e. each layer can be sequentially trained separately from the following ones. The following figure shows the structure of the network:\n\n<p align=\"center\">\n    <img width=\"500\" class=\"center\" alt=\"nlp_net\" src=\"https://user-images.githubusercontent.com/38586138/208651624-c159b230-f903-4e13-aaa7-b39a0d1c52fc.png\">\n</p>\n\n## What is missing\nThis app implements the main architectures presented by Hinton in his paper. However, there are still some features that are not implemented yet. In particular, the following features are missing:\n\n* [ ] Implementation of unsupervised training.\n* [ ] Implementation of the `progressive` architecture using local receptive fields instead of fully connected layers.\n* [ ] Training on CIFAR-10 for CV-based architectures.\n\nAnd don't forget to [leave a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers) if you appreciate the project!\nIf you have any questions about the implementation, [open an issue](https://github.com/nebuly-ai/nebullvm/issues) or contact us in the [community chat](https://discord.gg/RbeQMu886J).\n\n## Contributing\n\nWe welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the linked page for more information on how to get involved.\n\nA special thanks to [Additi Pandey](https://github.com/cyclotomicextension) for her amazing contribution to the Forward-Forward module.\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/__init__.py",
    "content": "from forward_forward.api.functions import (  # noqa F401\n    train_with_forward_forward_algorithm,\n)\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/api/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/forward_forward/forward_forward/api/functions.py",
    "content": "from torchvision import datasets\n\nfrom forward_forward.root_op import (\n    ForwardForwardRootOp,\n    ForwardForwardModelType,\n)\n\n\ndef train_with_forward_forward_algorithm(\n    n_layers: int = 2,\n    model_type: str = \"progressive\",\n    device: str = \"cpu\",\n    hidden_size: int = 2000,\n    lr: float = 0.03,\n    epochs: int = 100,\n    batch_size: int = 5000,\n    theta: float = 2.0,\n    shuffle: bool = True,\n    **kwargs,\n):\n    model_type = ForwardForwardModelType(model_type)\n    root_op = ForwardForwardRootOp(model_type)\n\n    output_size = None\n    if model_type is ForwardForwardModelType.PROGRESSIVE:\n        input_size = 28 * 28 + len(datasets.MNIST.classes)\n    elif model_type is ForwardForwardModelType.RECURRENT:\n        input_size = 28 * 28\n        output_size = len(datasets.MNIST.classes)\n    else:  # model_type is ForwardForwardModelType.NLP\n        input_size = 10  # number of characters\n        output_size = 30  # length of vocabulary\n        assert (\n            kwargs.get(\"predicted_tokens\") is not None\n        ), \"predicted_tokens must be specified for NLP model\"\n\n    root_op.execute(\n        input_size=input_size,\n        n_layers=n_layers,\n        hidden_size=hidden_size,\n        optimizer_name=\"Adam\",\n        optimizer_params={\"lr\": lr},\n        loss_fn_name=\"alternative_loss_fn\",\n        batch_size=batch_size,\n        epochs=epochs,\n        device=device,\n        shuffle=shuffle,\n        theta=theta,\n        output_size=output_size,\n    )\n\n    return root_op.get_result()\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/app.py",
    "content": "from nebullvm.apps.base import App\n\nfrom forward_forward.root_op import ForwardForwardRootOp\n\n\nclass ForwardForwardApp(App):\n    def __init__(self):\n        super().__init__()\n        self.root_op = ForwardForwardRootOp()\n\n    def execute(self, *args, **kwargs):\n        return self.root_op.execute(*args, **kwargs)\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/operations/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/forward_forward/forward_forward/operations/build_models.py",
    "content": "from abc import ABC, abstractmethod\n\nimport torch\n\nfrom nebullvm.operations.base import Operation\n\nfrom forward_forward.utils.modules import (\n    FCNetFFProgressive,\n    RecurrentFCNetFF,\n    LMFFNet,\n)\n\n\nclass BaseModelBuildOperation(Operation, ABC):\n    def __init__(self):\n        super().__init__()\n        self.model = None\n\n    @abstractmethod\n    def execute(\n        self,\n        input_size: int,\n        n_layers: int,\n        hidden_size: int,\n        optimizer_name: str,\n        optimizer_params: dict,\n        loss_fn_name: str,\n        output_size: int = None,\n    ):\n        raise NotImplementedError\n\n    def get_result(self):\n        return self.model\n\n\nclass FCNetFFProgressiveBuildOperation(BaseModelBuildOperation):\n    def __init__(self):\n        super().__init__()\n\n    def execute(\n        self,\n        input_size: int,\n        n_layers: int,\n        hidden_size: int,\n        optimizer_name: str,\n        optimizer_params: dict,\n        loss_fn_name: str,\n        output_size: int = None,\n    ):\n        layer_sizes = [input_size] + [hidden_size] * n_layers\n        model = FCNetFFProgressive(\n            layer_sizes=layer_sizes,\n            optimizer_name=optimizer_name,\n            optimizer_kwargs=optimizer_params,\n            loss_fn_name=loss_fn_name,\n            epochs=-1,\n        )\n        if output_size is not None:\n            output_layer = torch.nn.Linear(layer_sizes[-1], output_size)\n            model = torch.nn.Sequential(model, output_layer)\n\n        self.model = model\n\n\nclass RecurrentFCNetFFBuildOperation(BaseModelBuildOperation):\n    def __init__(self):\n        super().__init__()\n\n    def execute(\n        self,\n        input_size: int,\n        n_layers: int,\n        hidden_size: int,\n        optimizer_name: str,\n        optimizer_params: dict,\n        loss_fn_name: str,\n        output_size: int = None,\n    ):\n        layer_sizes = [input_size] + 
[hidden_size] * n_layers + [output_size]\n        model = RecurrentFCNetFF(\n            layer_sizes=layer_sizes,\n            optimizer_name=optimizer_name,\n            optimizer_kwargs=optimizer_params,\n            loss_fn_name=loss_fn_name,\n        )\n        self.model = model\n\n\nclass LMFFNetBuildOperation(BaseModelBuildOperation):\n    def __init__(self):\n        super().__init__()\n\n    def execute(\n        self,\n        input_size: int,\n        n_layers: int,\n        hidden_size: int,\n        optimizer_name: str,\n        optimizer_params: dict,\n        loss_fn_name: str,\n        output_size: int = None,\n    ):\n        model = LMFFNet(\n            token_num=output_size,\n            hidden_size=hidden_size,\n            n_layers=n_layers,\n            seq_len=input_size,\n            optimizer_name=optimizer_name,\n            optimizer_kwargs=optimizer_params,\n            loss_fn_name=loss_fn_name,\n            epochs=-1,\n            predicted_tokens=-1,\n        )\n        self.model = model\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/operations/data.py",
    "content": "import urllib.request\nfrom typing import Any\n\nimport torch\nimport torch.utils.data\nfrom nebullvm.operations.base import Operation\nfrom torchvision import datasets, transforms\n\n\nclass MNISTDataLoaderOperation(Operation):\n    \"\"\"DataLoaderOperation\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self.train_data = None\n        self.test_data = None\n\n    def get_result(self) -> Any:\n        if self.train_data is not None:\n            return self.train_data, self.test_data\n        else:\n            return None\n\n    def execute(self, batch_size: int, shuffle: bool):\n        train_loader = torch.utils.data.DataLoader(\n            datasets.MNIST(\n                \"data\",\n                train=True,\n                download=True,\n                transform=transforms.Compose(\n                    [\n                        transforms.ToTensor(),\n                        transforms.Normalize((0.1307,), (0.3081,)),\n                    ]\n                ),\n            ),\n            batch_size=batch_size,\n            shuffle=shuffle,\n        )\n        test_loader = torch.utils.data.DataLoader(\n            datasets.MNIST(\n                \"data\",\n                train=False,\n                transform=transforms.Compose(\n                    [\n                        transforms.ToTensor(),\n                        transforms.Normalize((0.1307,), (0.3081,)),\n                    ]\n                ),\n            ),\n            batch_size=1000,\n            shuffle=False,\n        )\n        self.train_data = train_loader\n        self.test_data = test_loader\n\n\ndef download_fables():\n    http_str = \"http://classics.mit.edu/Aesop/fab.mb.txt\"\n    with urllib.request.urlopen(http_str) as response:\n        html = response.read()\n    return html.decode(\"utf-8\")\n\n\ndef get_fables():\n    fables = download_fables()\n    fables = fables.split(\"SECTION 1\")[1]\n    fables = fables.split(\"THE 
END\")[0]\n    fables = fables.split(\"\\n\\n\")\n    fables = [fable for fable in fables if len(fable) >= 100]\n    return fables\n\n\nVOCABULARY = {\n    \" \": 0,\n    \"!\": 1,\n    \",\": 2,\n    \".\": 3,\n    \"a\": 4,\n    \"b\": 5,\n    \"c\": 6,\n    \"d\": 7,\n    \"e\": 8,\n    \"f\": 9,\n    \"g\": 10,\n    \"h\": 11,\n    \"i\": 12,\n    \"j\": 13,\n    \"k\": 14,\n    \"l\": 15,\n    \"m\": 16,\n    \"n\": 17,\n    \"o\": 18,\n    \"p\": 19,\n    \"q\": 20,\n    \"r\": 21,\n    \"s\": 22,\n    \"t\": 23,\n    \"u\": 24,\n    \"v\": 25,\n    \"w\": 26,\n    \"x\": 27,\n    \"y\": 28,\n    \"z\": 29,\n}\n\n\ndef tokenize(fable, max_len=100):\n    tokenized_fable = [\n        VOCABULARY[char]\n        for i, char in enumerate(fable.lower())\n        if char in VOCABULARY\n    ]\n    return tokenized_fable[:max_len]\n\n\ndef get_tokenized_fables():\n    fables = get_fables()\n    tokenized_fables = [tokenize(fable) for fable in fables]\n    tokenized_fables = torch.stack(\n        [\n            torch.tensor(tokens)\n            for tokens in tokenized_fables\n            if len(tokens) == 100\n        ]\n    )\n    return tokenized_fables\n\n\ndef get_dataloader(batch_size=32, test_size=0.2, shuffle=True):\n    tokenized_fables = get_tokenized_fables()\n    n_test = int(len(tokenized_fables) * test_size)\n    test_set = torch.utils.data.TensorDataset(tokenized_fables[:n_test])\n    train_set = torch.utils.data.TensorDataset(tokenized_fables[n_test:])\n    train_loader = torch.utils.data.DataLoader(\n        train_set, batch_size=batch_size, shuffle=shuffle\n    )\n    test_loader = torch.utils.data.DataLoader(\n        test_set, batch_size=n_test, shuffle=False\n    )\n    return train_loader, test_loader\n\n\nclass AesopFablesDataLoaderOperation(Operation):\n    \"\"\"DataLoaderOperation\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self.train_data = None\n        self.test_data = None\n\n    def get_result(self) -> Any:\n       
 if self.train_data is not None:\n            return self.train_data, self.test_data\n        else:\n            return None\n\n    def execute(self, batch_size: int, shuffle: bool):\n        train_loader, test_loader = get_dataloader(\n            batch_size=batch_size, test_size=0.2, shuffle=shuffle\n        )\n        self.train_data = train_loader\n        self.test_data = test_loader\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/operations/fetch_operations.py",
    "content": "from typing import Any\n\nfrom nebullvm.operations.base import Operation\nfrom torch.utils.data import DataLoader\n\n\nclass FetchTrainingDataFromLocal(Operation):\n    def get_result(self) -> Any:\n        pass\n\n    def execute(self, train_data: DataLoader, test_data: DataLoader):\n        self.state[\"train_data\"] = train_data\n        self.state[\"test_data\"] = test_data\n\n    def get_train_data(self) -> DataLoader:\n        return self.state.get(\"train_data\")\n\n    def get_test_data(self) -> DataLoader:\n        return self.state.get(\"test_data\")\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/operations/trainers.py",
    "content": "from abc import ABC, abstractmethod\n\nimport torch\nfrom nebullvm.operations.base import Operation\nfrom nebullvm.operations.fetch_operations.local import FetchModelFromLocal\nfrom torch.utils.data import DataLoader\nfrom torchvision import datasets\n\nfrom forward_forward.operations.data import VOCABULARY\nfrom forward_forward.operations.fetch_operations import (\n    FetchTrainingDataFromLocal,\n)\nfrom forward_forward.utils.labels import LabelsInjector\nfrom forward_forward.utils.modules import FCNetFFProgressive\nfrom forward_forward.utils.utils import (\n    ProgressiveTrainingDataset,\n    compute_perplexity,\n)\n\n\nclass BaseForwardForwardTrainer(Operation, ABC):\n    def __init__(self):\n        super().__init__()\n        self.model = None\n        self.train_data = None\n        self.test_data = None\n\n        self.fetch_model_op = FetchModelFromLocal()\n        self.fetch_data_op = FetchTrainingDataFromLocal()\n\n    def get_result(self):\n        if self.state.get(\"model_is_trained\"):\n            return self.model\n\n    def execute(\n        self,\n        model: FCNetFFProgressive,\n        train_data: DataLoader,\n        test_data: DataLoader,\n        epochs: int,\n        theta: float,\n        device: str,\n        **kwargs,\n    ):\n        if self.fetch_model_op.get_model() is None:\n            self.fetch_model_op.execute(model)\n\n        if self.fetch_data_op.get_train_data() is None:\n            self.fetch_data_op.execute(train_data, test_data)\n\n        self.model = self.fetch_model_op.get_model()\n        self.train_data = self.fetch_data_op.get_train_data()\n        self.test_data = self.fetch_data_op.get_test_data()\n\n        if (\n            self.model is not None\n            and self.train_data is not None\n            and self.test_data is not None\n        ):\n            self._train(epochs, theta, device, **kwargs)\n\n    @abstractmethod\n    def _train(self, *args, **kwargs):\n        raise 
NotImplementedError\n\n\nclass ForwardForwardTrainer(BaseForwardForwardTrainer):\n    def _train(self, epochs: int, theta: float, device: str, **kwargs):\n        # Define model\n        model = self.model.to(device)\n        model.epochs = epochs\n        batch_size = self.train_data.batch_size\n\n        # TODO: SELECT THE N_CLASSES OUTSIDE THE OPERATION\n        label_injector = LabelsInjector(datasets.MNIST.classes)\n\n        progressive_dataset = ProgressiveTrainingDataset(\n            (label_injector.inject_train(x, y) for x, y in self.train_data)\n        )\n        progressive_dataloader = torch.utils.data.DataLoader(\n            progressive_dataset, batch_size=2 * batch_size, shuffle=False\n        )\n\n        model.train()\n        model.progressive_train(progressive_dataloader, theta)\n\n        model.eval()\n        correct = 0\n        with torch.no_grad():\n            for data, target in self.test_data:\n                input_data = label_injector.inject_eval(data)\n                input_data = input_data.to(device)\n                target = target.to(device)\n                input_shapes = input_data.shape[:-1]\n                input_data = input_data.reshape(-1, input_data.shape[-1])\n                _, prob = model.positive_eval(input_data, theta)\n                prob = prob.reshape(*input_shapes)\n                pred = prob.argmax(dim=1)\n                correct += (pred == target).float().sum().item()\n        if isinstance(correct, torch.Tensor):\n            correct = correct.item()\n        self.logger.info(\n            \"Test set: Accuracy: {}/{} ({:.0f}%)\".format(\n                correct,\n                len(self.test_data.dataset),\n                100.0 * correct / len(self.test_data.dataset),\n            )\n        )\n\n\nclass RecurrentForwardForwardTrainer(BaseForwardForwardTrainer):\n    def _train(self, epochs: int, theta: float, device: str, **kwargs):\n        model = self.model.to(device)\n\n        for epoch in 
range(epochs):\n            accumulated_goodness = None\n            model.train()\n            for j, (data, target) in enumerate(self.train_data):\n                # TODO: THE IMAGE SHAPE SHOULD NOT BE DEFINED HERE\n                data = data.to(device).reshape(-1, 28 * 28)\n                target = torch.functional.F.one_hot(\n                    target.to(device),\n                    num_classes=len(datasets.MNIST.classes),\n                )\n                _, goodness = model.ff_train(data, target, theta)\n                if accumulated_goodness is None:\n                    accumulated_goodness = goodness\n                else:\n                    accumulated_goodness[0] += goodness[0]\n                    accumulated_goodness[1] += goodness[1]\n            goodness_ratio = (\n                accumulated_goodness[0] - accumulated_goodness[1]\n            ) / abs(max(accumulated_goodness))\n            self.logger.info(f\"Epoch {epoch + 1}\")\n            self.logger.info(f\"Accumulated goodness: {accumulated_goodness}\")\n            self.logger.info(f\"Goodness ratio: {goodness_ratio}\")\n            model.eval()\n            correct = 0\n            with torch.no_grad():\n                for data, target in self.test_data:\n                    data = data.to(device).reshape(-1, 28 * 28)\n                    target = target.to(device)\n                    pred, _ = model.positive_eval(data, theta)\n                    correct += pred.eq(target.view_as(pred)).sum().item()\n            self.logger.info(\n                f\"Test accuracy: {correct} / 10000 ({correct / 10000 * 100}%)\"\n            )\n\n\nclass NLPForwardForwardTrainer(BaseForwardForwardTrainer):\n    def _train(\n        self,\n        epochs: int,\n        theta: float,\n        device: str,\n        predicted_tokens: int,\n        **kwargs,\n    ):\n        model = self.model.to(device)\n        self.model.epochs = epochs\n        self.model.predicted_tokens = predicted_tokens\n        
token_num = len(VOCABULARY)\n        sequence_len = self.model.seq_len\n\n        for input_data in self.train_data:\n            input_data = torch.functional.F.one_hot(\n                input_data[0].to(device), num_classes=token_num\n            ).float()\n\n            accumulated_goodness = model.LM_ff_train(input_data, theta=theta)\n            goodness_ratio = (\n                accumulated_goodness[0] - accumulated_goodness[1]\n            ) / abs(max(accumulated_goodness))\n            self.logger.info(\"Trained on batch\")\n            self.logger.info(f\"Accumulated goodness: {accumulated_goodness}\")\n            self.logger.info(f\"Accumulated goodness ratio: {goodness_ratio}\")\n\n        for test_data in self.test_data:\n            test_data = torch.functional.F.one_hot(\n                test_data[0].to(device), num_classes=token_num\n            ).float()\n            test_data = test_data.reshape(-1, token_num * sequence_len)\n            predictions, _ = model.positive_eval(test_data, theta)\n            perplexity = compute_perplexity(predictions)\n            self.logger.info(f\"Perplexity: {perplexity}\")\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/root_op.py",
    "content": "from enum import Enum\n\nfrom nebullvm.operations.base import Operation\n\nfrom forward_forward.operations.build_models import (\n    FCNetFFProgressiveBuildOperation,\n    RecurrentFCNetFFBuildOperation,\n    LMFFNetBuildOperation,\n)\nfrom forward_forward.operations.data import (\n    MNISTDataLoaderOperation,\n    AesopFablesDataLoaderOperation,\n)\nfrom forward_forward.operations.trainers import (\n    ForwardForwardTrainer,\n    RecurrentForwardForwardTrainer,\n    NLPForwardForwardTrainer,\n)\n\n\nclass ForwardForwardModelType(Enum):\n    PROGRESSIVE = \"progressive\"\n    RECURRENT = \"recurrent\"\n    NLP = \"nlp\"\n\n\nclass ForwardForwardRootOp(Operation):\n    \"\"\"Root operation chaining model build, data loading and training.\"\"\"\n\n    def __init__(self, model_type: ForwardForwardModelType):\n        \"\"\"Pick the build, train and data operations for ``model_type``.\n\n        Raises:\n            ValueError: if ``model_type`` is not a supported model type.\n        \"\"\"\n        super().__init__()\n\n        # Accept the enum member or its string value (e.g. \"nlp\"); an\n        # unknown value raises ValueError instead of leaving the operation\n        # without build_model / train_model / load_data attributes.\n        model_type = ForwardForwardModelType(model_type)\n\n        if model_type is ForwardForwardModelType.PROGRESSIVE:\n            self.build_model = FCNetFFProgressiveBuildOperation()\n            self.train_model = ForwardForwardTrainer()\n            self.load_data = MNISTDataLoaderOperation()\n        elif model_type is ForwardForwardModelType.RECURRENT:\n            self.build_model = RecurrentFCNetFFBuildOperation()\n            self.train_model = RecurrentForwardForwardTrainer()\n            self.load_data = MNISTDataLoaderOperation()\n        elif model_type is ForwardForwardModelType.NLP:\n            self.build_model = LMFFNetBuildOperation()\n            self.train_model = NLPForwardForwardTrainer()\n            self.load_data = AesopFablesDataLoaderOperation()\n        else:\n            # Guards against enum members added without a matching branch.\n            raise ValueError(f\"Unsupported model type: {model_type}\")\n\n    def execute(\n        self,\n        input_size: int,\n        n_layers: int,\n        hidden_size: int,\n        optimizer_name: str,\n        optimizer_params: dict,\n        loss_fn_name: str,\n        batch_size: int,\n        epochs: int,\n        shuffle: bool,\n        theta: float,\n        device: str,\n        output_size: int = None,\n        **kwargs,\n    ):\n        \"\"\"Build the model and load the data if needed, then run training.\n\n        Each sub-operation is executed only when it has no result yet; the\n        trainer's result is stored under the \"model\" key of ``self.state``.\n        Extra keyword arguments are forwarded to the training operation.\n        \"\"\"\n        if self.build_model.get_result() is None:\n            self.build_model.execute(\n                input_size=input_size,\n                n_layers=n_layers,\n                hidden_size=hidden_size,\n                optimizer_name=optimizer_name,\n                optimizer_params=optimizer_params,\n                loss_fn_name=loss_fn_name,\n                output_size=output_size,\n            )\n\n        if self.load_data.get_result() is None:\n            self.load_data.execute(batch_size=batch_size, shuffle=shuffle)\n\n        if (\n            self.build_model.get_result() is not None\n            and self.load_data.get_result() is not None\n        ):\n            if self.train_model.get_result() is None:\n                train_loader, test_loader = self.load_data.get_result()\n                self.train_model.execute(\n                    model=self.build_model.get_result(),\n                    train_data=train_loader,\n                    test_data=test_loader,\n                    epochs=epochs,\n                    theta=theta,\n                    device=device,\n                    **kwargs,\n                )\n            if self.train_model.get_result() is not None:\n                self.state[\"model\"] = self.train_model.get_result()\n\n    def get_result(self):\n        \"\"\"Return the entry stored under \"model\" in ``self.state``, if any.\"\"\"\n        return self.state.get(\"model\")\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/utils/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/forward_forward/forward_forward/utils/labels.py",
    "content": "from typing import List\n\nimport torch\n\n\nclass LabelsInjector:\n    \"\"\"Attach one-hot label encodings to flattened inputs.\"\"\"\n\n    def __init__(self, labels: List):\n        # Build one one-hot vector of size len(labels) per class:\n        #  self.labels[i] is the encoding of the class with index i.\n        self.label_names = labels\n        self.labels = [\n            torch.nn.functional.one_hot(\n                torch.tensor([i]), len(labels)\n            ).reshape(-1)\n            for i in range(len(labels))\n        ]\n\n    @torch.no_grad()\n    def inject_train(self, input_image: torch.Tensor, labels: torch.Tensor):\n        \"\"\"Build positive and negative training samples for a batch.\n\n        Returns ``(images, signs)``: ``images`` holds the flattened inputs\n        followed by their true one-hot label (first ``bs`` rows) and the same\n        inputs followed by a randomly drawn different label (last ``bs``\n        rows); ``signs`` is +1 for the former and -1 for the latter.\n        \"\"\"\n        # inject label in the input image\n        bs = input_image.shape[0]\n        injecting_labels = torch.stack(\n            [self.labels[label] for label in labels]\n        )\n        negative_injecting_labels = torch.stack(\n            [\n                self.labels[label]\n                for label in select_random_different_label(\n                    labels, len(self.labels)\n                )\n            ]\n        )\n        positive_images = torch.cat(\n            [input_image.reshape(bs, -1), injecting_labels], dim=1\n        )\n        negative_images = torch.cat(\n            [input_image.reshape(bs, -1), negative_injecting_labels], dim=1\n        )\n        images = torch.cat([positive_images, negative_images], dim=0)\n        signs = torch.cat([torch.ones(bs), -torch.ones(bs)], dim=0)\n        return images, signs\n\n    @torch.no_grad()\n    def inject_eval(self, input_image: torch.Tensor):\n        \"\"\"Pair every input of the batch with every label encoding.\n\n        Returns a tensor of shape (batch, n_labels, features + n_labels).\n        \"\"\"\n        # input image is expected to have batch size 1\n        # TODO: FIX THIS BEHAVIOUR\n        # NOTE(review): the repeat calls below use input_image.shape[0], so\n        #  larger batches look supported - confirm whether the TODO is stale.\n        labels = torch.stack(self.labels).unsqueeze(0)\n        labels = labels.repeat(input_image.shape[0], 1, 1)\n        input_image = input_image.reshape(input_image.shape[0], -1).unsqueeze(\n            1\n        )\n        replicated_input = input_image.repeat(1, len(self.labels), 1)\n        new_input = torch.cat([replicated_input, labels], dim=2)\n        return new_input  # .reshape(-1, new_input.shape[2])\n\n\ndef select_random_different_label(labels: torch.Tensor, n_classes: int):\n    \"\"\"Yield, for each entry of ``labels``, a random class index that differs.\n\n    Raises:\n        ValueError: if ``n_classes`` is smaller than 2, since no different\n            label can then be drawn.\n    \"\"\"\n    if n_classes < 2:\n        raise ValueError(\"n_classes must be at least 2\")\n    # Iterate over the labels themselves: enumerate() would yield\n    # (index, label) tuples that never compare equal to the drawn sample,\n    # so the true label could be returned as a negative one.\n    for label in labels:\n        samples = torch.randint(0, n_classes, (1,))\n        while samples[0] == label:\n            samples = torch.randint(0, n_classes, (1,))\n        yield samples[0]\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/utils/modules.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import List\n\nimport torch\nimport torch.utils.data\n\nfrom forward_forward.utils.utils import ProgressiveTrainingDataset\n\n\ndef loss_fn(y, theta, sign):\n    logits = torch.square(y).mean(dim=1) - theta\n    loss = -logits * sign\n    with torch.no_grad():\n        accumulated_logits = logits.mean().item()\n    loss = loss.mean()\n    return loss, accumulated_logits\n\n\ndef probabilistic_loss_fn(y, theta, sign):\n    logits = torch.square(y).mean(dim=1) - theta\n    prob = torch.sigmoid(logits)\n    loss = -torch.log(prob + 1e-6) * sign\n    with torch.no_grad():\n        accumulated_logits = logits.mean().item()\n    loss = loss.mean()\n    return loss, accumulated_logits\n\n\ndef alternative_loss_fn(y, theta, sign):\n    logits = y.pow(2).mean(dim=1) - theta\n    with torch.no_grad():\n        accumulated_logits = logits.mean().item()\n    logits = -logits * sign\n    prob = torch.nan_to_num(torch.exp(logits))\n    loss = torch.log(1 + prob)\n    loss = loss.mean()\n    return loss, accumulated_logits\n\n\nclass BaseFFLayer(torch.nn.Module, ABC):\n    @abstractmethod\n    def ff_train(\n        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float\n    ):\n        raise NotImplementedError\n\n    @abstractmethod\n    def positive_eval(self, input_tensor: torch.Tensor, theta: float):\n        raise NotImplementedError\n\n    @property\n    def requires_training(self):\n        return True\n\n\nclass FFLayer(BaseFFLayer):\n    \"\"\"Layer wrapper for efficient forward-forward layers.\"\"\"\n\n    def __init__(\n        self,\n        layer,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n        loss_fn_name: str = \"loss_fn\",\n    ):\n        super().__init__()\n        self.layer = layer\n        self.optimizer = getattr(torch.optim, optimizer_name)(\n            layer.parameters(), **optimizer_kwargs\n        )\n        if loss_fn_name == \"loss_fn\":\n           
 self.loss_fn = loss_fn\n        elif loss_fn_name == \"alternative_loss_fn\":\n            self.loss_fn = alternative_loss_fn\n        elif loss_fn_name == \"probabilistic_loss_fn\":\n            self.loss_fn = probabilistic_loss_fn\n\n    def forward(self, x):\n        return self.layer(x)\n\n    def ff_train(\n        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float\n    ):\n        \"\"\"Train the layer with the given target.\"\"\"\n        # upgrade optimizer for positive goodness\n        y = self(input_tensor.detach())\n        y_pos = y[torch.where(signs == 1)]\n        y_neg = y[torch.where(signs == -1)]\n        # y_pos = self(input_tensor.detach()[torch.where(signs == 1)])\n        loss_pos, cumulated_logits_pos = self.loss_fn(y_pos, theta, sign=1)\n        # self.optimizer.zero_grad()\n        # loss_pos.backward()\n        # print(loss_pos.item())\n        # self.optimizer.step()\n        # y_neg = self(input_tensor.detach()[torch.where(signs == -1)])\n        loss_neg, cumulated_logits_neg = self.loss_fn(y_neg, theta, sign=-1)\n        self.optimizer.zero_grad()\n        loss = loss_pos + loss_neg\n        loss.backward()\n        self.optimizer.step()\n        separation = [cumulated_logits_pos, cumulated_logits_neg]\n        y = torch.zeros(\n            input_tensor.shape[0], *y_pos.shape[1:], device=input_tensor.device\n        )\n        y[torch.where(signs == 1)] = y_pos\n        y[torch.where(signs == -1)] = y_neg\n        return y.detach(), separation\n\n    @torch.no_grad()\n    def positive_eval(self, input_tensor: torch.Tensor, theta: float):\n        \"\"\"Evaluate the layer with the given input and theta.\"\"\"\n        y = self(input_tensor)\n        return y, torch.square(y).mean(dim=1) - theta\n\n\nclass FFNormalization(BaseFFLayer):\n    def __init__(self):\n        super().__init__()\n\n    def forward(self, x):\n        l2_norm = (\n            torch.norm(x.reshape(x.shape[0], -1), p=2, dim=1, keepdim=True)\n      
      + 1e-8\n        )\n        return x / l2_norm\n\n    def ff_train(\n        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float\n    ):\n        \"\"\"Normalize the input; there is nothing to train here.\"\"\"\n        with torch.no_grad():\n            output = self(input_tensor)\n        return output, None\n\n    @torch.no_grad()\n    def positive_eval(self, input_tensor: torch.Tensor, theta: float):\n        with torch.no_grad():\n            output = self(input_tensor)\n\n        return output, torch.zeros(\n            input_tensor.shape[0], device=input_tensor.device\n        )\n\n    @property\n    def requires_training(self):\n        return False\n\n\nclass LinearReLU(torch.nn.Module):\n    def __init__(self, in_features, out_features):\n        super().__init__()\n        self.linear = torch.nn.Linear(in_features, out_features, bias=True)\n        self.relu = torch.nn.ReLU()\n\n    def forward(self, x):\n        return self.relu(self.linear(x))\n\n\nclass FCNetFFProgressive(BaseFFLayer):\n    \"\"\"FCNet trained using forward-forward algorithm. The network is trained\n    in a progressive manner, i.e. 
the first layer is trained, then the\n    second layer, and so on.\n    \"\"\"\n\n    def __init__(\n        self,\n        layer_sizes: list,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n        epochs: int,\n        loss_fn_name: str = \"loss_fn\",\n    ):\n        super().__init__()\n        self.epochs = epochs\n        self.layers = torch.nn.ModuleList()\n        for i in range(len(layer_sizes) - 1):\n            self.layers.append(FFNormalization())\n            self.layers.append(\n                FFLayer(\n                    LinearReLU(layer_sizes[i], layer_sizes[i + 1]),\n                    optimizer_name,\n                    optimizer_kwargs,\n                    loss_fn_name,\n                )\n            )\n\n    def forward(self, x):\n        for layer in self.layers:\n            x = layer(x)\n        return x\n\n    def progressive_train(self, dl: torch.utils.data.DataLoader, theta: float):\n        \"\"\"Train the network in a progressive manner.\"\"\"\n        print(\"Training the network in a progressive manner.\")\n        for i, layer in enumerate(self.layers):\n            if layer.requires_training:\n                for epoch in range(self.epochs):\n                    accumulated_separation = None\n                    for j, (data, signs) in enumerate(dl):\n                        data = data.to(self.device)\n                        signs = signs.to(self.device)\n                        _, separation = layer.ff_train(data, signs, theta)\n                        if accumulated_separation is None:\n                            accumulated_separation = separation\n                        else:\n                            accumulated_separation[0] += separation[0]\n                            accumulated_separation[1] += separation[1]\n                        if j % 100 == 0:\n                            print(f\"Epoch: {epoch}, Batch: {j}, Layer: {i}\")\n                    print(f\"Epoch {epoch} of layer {i} done.\")\n   
                 accumulated_separation[0] /= len(dl.dataset)\n                    accumulated_separation[1] /= len(dl.dataset)\n                    separation_ratio = (\n                        accumulated_separation[0] - accumulated_separation[1]\n                    ) / abs(max(accumulated_separation))\n                    print(\"Goodness: \", accumulated_separation)\n                    print(f\"Accumulated separation: {separation_ratio}\")\n                print(f\"Finished training layer {i} / {len(self.layers)}.\")\n            # create a new dataloader for the next layer\n            dataset = ProgressiveTrainingDataset(\n                (\n                    (layer(x.to(self.device)), sign.to(self.device))\n                    for x, sign in dl\n                )\n            )\n            batch_size = dl.batch_size\n            dl = torch.utils.data.DataLoader(\n                dataset, batch_size=batch_size, shuffle=False\n            )\n        print(\"Finished training the network.\")\n\n    def ff_train(\n        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float\n    ):\n        \"\"\"Train the network with the given target.\"\"\"\n        accumulated_separation = None\n        for layer in self.layers:\n            input_tensor, separation = layer.ff_train(\n                input_tensor, signs, theta\n            )\n            if accumulated_separation is None:\n                accumulated_separation = separation\n            else:\n                accumulated_separation[0] += separation[0]\n                accumulated_separation[1] += separation[1]\n        return input_tensor, accumulated_separation\n\n    @torch.no_grad()\n    def positive_eval(self, input_tensor: torch.Tensor, theta: float):\n        \"\"\"Evaluate the network with the given input and theta.\"\"\"\n        accumulated_goodness = torch.zeros(\n            input_tensor.shape[0], device=input_tensor.device\n        )\n        for i, layer in 
enumerate(self.layers):\n            input_tensor, goodness = layer.positive_eval(input_tensor, theta)\n            if i > 1:\n                accumulated_goodness += goodness\n        return input_tensor, accumulated_goodness\n\n    @property\n    def device(self):\n        return next(self.parameters()).device\n\n\nclass NormLinearReLU(torch.nn.Module):\n    def __init__(self, in_features, out_features):\n        super().__init__()\n        self.norm = FFNormalization()\n        self.linear_relu = LinearReLU(in_features, out_features)\n\n    def forward(self, x):\n        return self.linear_relu(self.norm(x))\n\n\nclass RecurrentFFLayer(BaseFFLayer):\n    def __init__(\n        self,\n        hidden_size: int,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n        loss_fn_name: str,\n    ):\n        super().__init__()\n        self.layer = NormLinearReLU(2 * hidden_size, hidden_size)\n        self.optimizer = getattr(torch.optim, optimizer_name)(\n            self.layer.parameters(), **optimizer_kwargs\n        )\n        self.loss_fn = eval(loss_fn_name)\n\n    def forward(self, x_prev, x_same, x_next):\n        x = torch.cat((x_prev, x_next), dim=1)\n        new_x = self.layer(x)\n        new_x = 0.3 * x_same + 0.7 * new_x\n        return new_x\n\n    def ff_train(\n        self,\n        x_prev: torch.Tensor,\n        x_same: torch.Tensor,\n        x_next: torch.Tensor,\n        signs: torch.Tensor,\n        theta: float,\n    ):\n        new_x = self(x_prev.detach(), x_same.detach(), x_next.detach())\n        y_pos = new_x[signs == 1]\n        y_neg = new_x[signs == -1]\n        loss_pos, goodness_pos = self.loss_fn(y_pos, theta, 1)\n        loss_neg, goodness_neg = self.loss_fn(y_neg, theta, -1)\n        loss = loss_pos + loss_neg\n        self.optimizer.zero_grad()\n        loss.backward()\n        self.optimizer.step()\n        return new_x, [goodness_pos, goodness_neg]\n\n    @torch.no_grad()\n    def positive_eval(\n        self,\n       
 x_prev: torch.Tensor,\n        x_same: torch.Tensor,\n        x_next: torch.Tensor,\n        theta: float,\n    ):\n        new_x = self(x_prev, x_same, x_next)\n        goodness = new_x.pow(2).mean(dim=1) - theta\n        return new_x, goodness\n\n\nclass RecurrentProjectionFFLayer(BaseFFLayer):\n    def __init__(\n        self,\n        input_size: int,\n        output_size: int,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n        loss_fn_name: str,\n    ):\n        super().__init__()\n        self.layer = NormLinearReLU(input_size, output_size)\n        self.optimizer = getattr(torch.optim, optimizer_name)(\n            self.layer.parameters(), **optimizer_kwargs\n        )\n        self.loss_fn = eval(loss_fn_name)\n\n    def forward(self, x: torch.Tensor):\n        return self.layer(x)\n\n    def ff_train(\n        self,\n        x: torch.Tensor,\n        signs: torch.Tensor,\n        theta: float,\n    ):\n        new_x = self(x.detach())\n        y_pos = new_x[signs == 1]\n        y_neg = new_x[signs == -1]\n        loss_pos, goodness_pos = self.loss_fn(y_pos, theta, 1)\n        loss_neg, goodness_neg = self.loss_fn(y_neg, theta, -1)\n        loss = loss_pos + loss_neg\n        self.optimizer.zero_grad()\n        loss.backward()\n        self.optimizer.step()\n        return new_x, [goodness_pos, goodness_neg]\n\n    @torch.no_grad()\n    def positive_eval(self, x: torch.Tensor, theta: float):\n        new_x = self(x)\n        goodness = new_x.pow(2).mean(dim=1) - theta\n        return new_x, goodness\n\n\nclass RecurrentProjectedSoftmaxFFLayer(BaseFFLayer):\n    def __init__(\n        self,\n        input_size: int,\n        output_size: int,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n        loss_fn_name: str,\n    ):\n        super().__init__()\n        self.loss_fn = eval(loss_fn_name)\n        self.norm = FFNormalization()\n        self.linear = torch.nn.Linear(input_size, output_size)\n        self.softmax = 
torch.nn.Softmax(dim=1)\n        self.optimizer = getattr(torch.optim, optimizer_name)(\n            self.linear.parameters(), **optimizer_kwargs\n        )\n\n    def forward(self, x: torch.Tensor):\n        x = self.norm(x)\n        x = self.linear(x)\n        x = self.softmax(x)\n        return x\n\n    def ff_train(\n        self,\n        x: torch.Tensor,\n        signs: torch.Tensor,\n        theta: float,\n    ):\n        new_x = self(x.detach())\n        y_pos = new_x[signs == 1]\n        y_neg = new_x[signs == -1]\n        loss_pos, goodness_pos = self.loss_fn(y_pos, theta, 1)\n        loss_neg, goodness_neg = self.loss_fn(y_neg, theta, -1)\n        loss = loss_pos + loss_neg\n        self.optimizer.zero_grad()\n        loss.backward()\n        self.optimizer.step()\n        return new_x, [goodness_pos, goodness_neg]\n\n    @torch.no_grad()\n    def positive_eval(self, x: torch.Tensor, theta: float):\n        new_x = self(x)\n        goodness = new_x.pow(2).mean(dim=1) - theta\n        return new_x, goodness\n\n\nclass RecurrentFCNetFF(BaseFFLayer):\n    \"\"\"Recurrent FCNet trained using forward-forward algorithm.\"\"\"\n\n    def __init__(\n        self,\n        layer_sizes: list,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n        loss_fn_name: str = \"loss_fn\",\n    ):\n        super().__init__()\n        self.time_steps = 8\n        self.test_time_steps = 8\n        self.storable_time_steps = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n        # self.storable_time_steps = [3, 4, 5]\n        self.states = []\n        self.layers = torch.nn.ModuleList()\n        self.projector = RecurrentProjectionFFLayer(\n            layer_sizes[0],\n            layer_sizes[1],\n            optimizer_name,\n            optimizer_kwargs,\n            loss_fn_name,\n        )\n        for i in range(1, len(layer_sizes) - 1):\n            self.layers.append(\n                RecurrentFFLayer(\n                    layer_sizes[i],\n                    
optimizer_name,\n                    optimizer_kwargs,\n                    loss_fn_name,\n                )\n            )\n        self.proj_y = RecurrentProjectionFFLayer(\n            layer_sizes[-1],\n            layer_sizes[-2],\n            optimizer_name,\n            optimizer_kwargs,\n            loss_fn_name,\n        )\n        self.softmax = RecurrentProjectedSoftmaxFFLayer(\n            layer_sizes[-2],\n            layer_sizes[-1],\n            optimizer_name,\n            optimizer_kwargs,\n            loss_fn_name,\n        )\n        self.num_labels = layer_sizes[-1]\n\n    @property\n    def device(self):\n        return next(self.parameters()).device\n\n    @torch.no_grad()\n    def bottom_up(self, x: torch.Tensor, y: torch.Tensor):\n        states = []\n        x_proj = self.projector(x)\n\n        for layer in self.layers:\n            states.append(x_proj)\n            x_proj = layer(\n                x_proj,\n                torch.zeros_like(x_proj, device=self.device),\n                torch.zeros_like(x_proj, device=self.device),\n            )\n        states.append(x_proj)\n        states.append(y)\n        y_arg = torch.argmax(y, dim=1)\n        x_proj_ = x_proj.clone()\n        x_proj_[torch.arange(x_proj.shape[0]), y_arg] = -1e6\n        neg_prob = self.softmax(x_proj_)\n        cumulative_neg_prob = torch.cumsum(neg_prob, dim=1)\n        neg_samples = torch.argmax(\n            1.0\n            * (\n                cumulative_neg_prob > torch.rand(x.shape[0], 1).to(self.device)\n            ),\n            dim=1,\n        )\n        neg_samples = torch.functional.F.one_hot(\n            neg_samples, num_classes=self.num_labels\n        )\n        return states, neg_samples\n\n    def forward(self, x: torch.Tensor, prev_states: List[torch.Tensor]):\n        x_proj = self.projector(x)\n        new_states = []\n        for i, layer in enumerate(self.layers):\n            if i < len(self.layers) - 1:\n                next_state = 
prev_states[i + 2]\n            else:\n                next_state = self.proj_y(prev_states[i + 2].float())\n            new_states.append(x_proj)\n            x_proj = layer(prev_states[i], prev_states[i + 1], next_state)\n        new_states.append(x_proj)\n        y = self.softmax(x_proj)\n        new_states.append(y)\n        return new_states\n\n    def ff_train(\n        self, input_tensor: torch.Tensor, labels: torch.Tensor, theta: float\n    ):\n        \"\"\"Train the network with the given target.\"\"\"\n        with torch.no_grad():\n            states, neg_samples = self.bottom_up(input_tensor, labels)\n            neg_states, _ = self.bottom_up(input_tensor, neg_samples)\n            states = [\n                torch.cat([s, ns], dim=0) for s, ns in zip(states, neg_states)\n            ]\n            signs = torch.cat(\n                [\n                    torch.ones(input_tensor.shape[0], device=self.device),\n                    -torch.ones(input_tensor.shape[0], device=self.device),\n                ],\n                dim=0,\n            )\n            input_tensor = torch.cat([input_tensor, input_tensor], dim=0)\n        # states have been created, now we can train the network\n        x_proj, accumulated_goodness = self.projector.ff_train(\n            input_tensor, signs, theta\n        )\n        for _ in range(self.time_steps):\n            new_states = []\n            x = x_proj\n            for j, layer in enumerate(self.layers):\n                if j < len(self.layers) - 1:\n                    next_state = states[j + 2]\n                else:\n                    next_state = self.proj_y(states[j + 2].float())\n                new_states.append(x)\n                x, goodnesses = layer.ff_train(\n                    states[j], states[j + 1], next_state, signs, theta\n                )\n                accumulated_goodness[0] += goodnesses[0]\n                accumulated_goodness[1] += goodnesses[1]\n            new_states.append(x)\n      
      with torch.no_grad():\n                x_ = states[-2][torch.where(signs == -1)]\n                real_y = states[-1][torch.where(signs == 1)]\n                x_[\n                    torch.arange(x_.shape[0]), torch.argmax(real_y, dim=1)\n                ] = -1e6\n                y = self.softmax(x_)\n                cumulative_y = torch.cumsum(y, dim=1)\n                neg_samples = torch.argmax(\n                    1.0\n                    * (\n                        cumulative_y\n                        > torch.rand(x_.shape[0], 1).to(self.device)\n                    ),\n                    dim=1,\n                )\n                neg_samples = torch.functional.F.one_hot(\n                    neg_samples, num_classes=self.num_labels\n                )\n                # replace just negative samples\n                next_labels = states[-1].clone()\n                next_labels[torch.where(signs == -1)] = neg_samples\n                new_states.append(next_labels)\n            states = new_states\n        accumulated_goodness[0] /= self.time_steps * len(self.layers) + 1\n        accumulated_goodness[1] /= self.time_steps * len(self.layers) + 1\n        with torch.no_grad():\n            states = [t[: input_tensor.shape[0] // 2] for t in states]\n        return states, accumulated_goodness\n\n    @torch.no_grad()\n    def positive_eval(self, input_tensor: torch.Tensor, theta: float):\n        \"\"\"Evaluate the network with the given input and theta.\"\"\"\n        labels = torch.arange(0, self.num_labels, device=self.device)\n        labels = torch.functional.F.one_hot(\n            labels, num_classes=self.num_labels\n        )\n        original_bs = input_tensor.shape[0]\n        input_tensor = (\n            input_tensor.unsqueeze(1)\n            .repeat(1, self.num_labels, 1)\n            .reshape(-1, input_tensor.shape[-1])\n        )\n        labels = (\n            labels.unsqueeze(0)\n            .repeat(original_bs, 1, 1)\n            
.reshape(-1, labels.shape[-1])\n        )\n\n        states, _ = self.bottom_up(input_tensor, labels)\n        x_proj, goodness = self.projector.positive_eval(input_tensor, theta)\n        accumulated_goodness = goodness\n\n        for time_step in range(self.test_time_steps):\n            new_states = []\n            x = x_proj\n            for j, layer in enumerate(self.layers):\n                if j < len(self.layers) - 1:\n                    next_state = states[j + 2]\n                else:\n                    next_state = self.proj_y(states[j + 2].float())\n                new_states.append(x)\n                x, goodnesses = layer.positive_eval(\n                    states[j], states[j + 1], next_state, theta\n                )\n                if time_step in self.storable_time_steps:\n                    accumulated_goodness += goodnesses\n            new_states.append(x)\n            if time_step in self.storable_time_steps:\n                _, goodness = self.softmax.positive_eval(x, theta)\n                accumulated_goodness += goodness\n            new_states.append(states[-1])\n            states = new_states\n        accumulated_goodness = accumulated_goodness.reshape(\n            original_bs, self.num_labels\n        )\n        prediction = torch.argmax(accumulated_goodness, dim=1)\n        return prediction, accumulated_goodness\n\n\nclass LMFFLinearSoftmax(BaseFFLayer):\n    def __init__(\n        self,\n        input_size: int,\n        output_size: int,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n    ):\n        super().__init__()\n        self.loss_fn = torch.nn.NLLLoss()\n        self.norm = FFNormalization()\n        self.linear = torch.nn.Linear(input_size, output_size)\n        self.softmax = torch.nn.Softmax(dim=1)\n        self.optimizer = getattr(torch.optim, optimizer_name)(\n            self.parameters(), **optimizer_kwargs\n        )\n\n    def forward(self, x: torch.Tensor):\n        x = self.norm(x)\n        
x = self.linear(x)\n        x = self.softmax(x)\n        return x\n\n    def ff_train(\n        self,\n        input_tensor: torch.Tensor,\n        labels: torch.Tensor,\n        signs: torch.Tensor,\n    ):\n        x = input_tensor[torch.where(signs == 1)]\n        y = labels[torch.where(signs == 1)]\n        x = self(x)\n        loss = self.loss_fn(x, torch.argmax(y, dim=1))\n        self.optimizer.zero_grad()\n        loss.backward()\n        self.optimizer.step()\n        with torch.no_grad():\n            x_neg = input_tensor[torch.where(signs == -1)]\n            new_y_neg = self(x_neg)\n            new_x = torch.zeros(\n                len(input_tensor), *x.shape[1:], device=input_tensor.device\n            )\n            new_x[torch.where(signs == 1)] = x\n            new_x[torch.where(signs == -1)] = new_y_neg\n        return new_x, loss.item()\n\n    @torch.no_grad()\n    def positive_eval(self, x: torch.Tensor):\n        pred = self(x)\n        return pred\n\n\nclass LMFFNet(BaseFFLayer):\n    def __init__(\n        self,\n        token_num: int,\n        hidden_size: int,\n        n_layers: int,\n        seq_len: int,\n        predicted_tokens: int,\n        epochs: int,\n        optimizer_name: str,\n        optimizer_kwargs: dict,\n        loss_fn_name: str = \"loss_fn\",\n    ):\n        super().__init__()\n        self.token_num = token_num\n        self.hidden_size = hidden_size\n        self.seq_len = seq_len\n        self.predicted_tokens = predicted_tokens\n        self.token2emb = RecurrentProjectionFFLayer(\n            token_num * seq_len,\n            hidden_size,\n            optimizer_name,\n            optimizer_kwargs,\n            loss_fn_name,\n        )\n        self.layers = torch.nn.ModuleList(\n            [\n                FFLayer(\n                    NormLinearReLU(hidden_size, hidden_size),\n                    optimizer_name,\n                    optimizer_kwargs,\n                    loss_fn_name,\n                )\n       
         for _ in range(n_layers)\n            ]\n        )\n        self.emb2token = LMFFLinearSoftmax(\n            n_layers * hidden_size, token_num, optimizer_name, optimizer_kwargs\n        )\n        self.epochs = epochs\n\n    def forward(self, input_tensor: torch.Tensor):\n        x = self.token2emb(input_tensor)\n        xs = []\n        for layer in self.layers:\n            x = layer(x)\n            xs.append(x)\n        x = torch.cat(xs, dim=1)\n        x = self.emb2token(x)\n        return x\n\n    def ff_train(\n        self,\n        input_tensor: torch.Tensor,\n        prev_pred: torch.Tensor,\n        labels: torch.Tensor,\n        theta: float,\n    ):\n        signs = torch.cat(\n            [\n                torch.ones(input_tensor.shape[0], device=input_tensor.device),\n                -torch.ones(input_tensor.shape[0], device=input_tensor.device),\n            ]\n        )\n        input_tensor = torch.cat([input_tensor, prev_pred], dim=0)\n        labels = torch.cat([labels, labels], dim=0)\n        for idx in range(self.epochs):\n            x, goodness = self.token2emb.ff_train(input_tensor, signs, theta)\n            if idx % 20 == 0:\n                print(f\"Epoch {idx}: {goodness}\")\n        accumulated_goodness = goodness\n        xs = []\n        for layer in self.layers:\n            for epoch in range(self.epochs):\n                x_new, goodness = layer.ff_train(x, signs, theta)\n                if epoch % 20 == 0:\n                    print(f\"Epoch {epoch}: {goodness}\")\n            x = x_new\n            xs.append(x)\n            accumulated_goodness[0] += goodness[0]\n            accumulated_goodness[1] += goodness[1]\n        x = torch.cat(xs, dim=1)\n        for epoch in range(self.epochs):\n            x_new, loss = self.emb2token.ff_train(x, labels, signs)\n            if epoch % 20 == 0 or epoch < 20:\n                print(f\"Epoch {epoch}: {loss}\")\n        x = x_new\n        next_input = input_tensor[signs == 
1].roll(-self.token_num, dims=1)\n        next_input[\n            :, -self.token_num :  # noqa E203\n        ] = torch.functional.F.one_hot(\n            torch.argmax(x[signs == 1], dim=1), num_classes=self.token_num\n        )\n        return next_input, accumulated_goodness\n\n    def LM_ff_train(self, input_tensor: torch.Tensor, theta: float):\n        with torch.no_grad():\n            input_tensor = input_tensor.reshape(\n                -1, self.token_num * self.seq_len\n            )\n            labels = input_tensor[:, -self.token_num :].roll(  # noqa E203\n                -1, dims=0\n            )\n            temp = torch.argmax(labels, dim=1)\n            print(temp.shape, torch.sum(temp == 0))\n            pred = self(input_tensor)\n            new_char = torch.functional.F.one_hot(\n                torch.argmax(pred, dim=1), num_classes=self.token_num\n            )\n            prev_pred = input_tensor.clone().roll(1)\n            prev_pred[:, -self.token_num :] = new_char  # noqa E203\n        _, accumulated_goodness = self.ff_train(\n            input_tensor, prev_pred, labels, theta\n        )\n        return accumulated_goodness\n\n    @torch.no_grad()\n    def positive_eval(self, input_tensor: torch.Tensor, theta: float):\n        cumulated_goodness = torch.zeros(\n            input_tensor.shape[0], device=input_tensor.device\n        )\n        prediction = torch.zeros(\n            input_tensor.shape[0],\n            self.predicted_tokens,\n            self.token_num,\n            device=input_tensor.device,\n        )\n        for idx in range(self.predicted_tokens):\n            x, goodness = self.token2emb.positive_eval(input_tensor, theta)\n            cumulated_goodness += goodness\n            xs = []\n            for layer in self.layers:\n                x, goodness = layer.positive_eval(x, theta)\n                xs.append(x)\n                cumulated_goodness += goodness\n            x = torch.cat(xs, dim=1)\n            x = 
self.emb2token.positive_eval(x)\n            prediction[:, idx] = x\n            input_tensor = input_tensor.roll(-self.token_num, dims=1)\n            input_tensor[\n                :, -self.token_num :  # noqa E203\n            ] = torch.functional.F.one_hot(\n                torch.argmax(x, dim=1), num_classes=self.token_num\n            )\n        cumulated_goodness /= self.predicted_tokens\n        return prediction, cumulated_goodness\n"
  },
  {
    "path": "optimization/forward_forward/forward_forward/utils/utils.py",
    "content": "from collections import Generator\n\nimport torch.utils.data\n\n\nclass ProgressiveTrainingDataset(torch.utils.data.Dataset):\n    \"\"\"Dataset for progressive training.\"\"\"\n\n    def __init__(self, dataset_generator: Generator):\n        with torch.no_grad():\n            self.internal_dataset = [\n                batch\n                for data, sign in dataset_generator\n                for batch in zip(data, sign)\n            ]\n\n    def __getitem__(self, index):\n        return self.internal_dataset[index]\n\n    def __len__(self):\n        return len(self.internal_dataset)\n\n\ndef compute_perplexity(tensor: torch.Tensor):\n    \"\"\"Compute perplexity of a tensor. The tensor has shape (batch_size,\n    sequence_length, vocab_size).\n    The softmax has already been computed over the vocab dimension.\n    \"\"\"\n    return torch.exp(-torch.sum(tensor * torch.log(tensor), dim=-1)).mean()\n"
  },
  {
    "path": "optimization/forward_forward/requirements.txt",
    "content": "torch>=1.9\ntorchvision>=0.10\nnebullvm>=0.6\n"
  },
  {
    "path": "optimization/forward_forward/setup.py",
    "content": "from pathlib import Path\nfrom setuptools import setup, find_packages\n\n\nREQUIREMENTS = [\n    \"torch>=1.9\",\n    \"torchvision>=0.10\",\n    \"nebullvm>=0.6\",\n]\n\nthis_directory = Path(__file__).parent\nlong_description = (this_directory / \"README.md\").read_text(encoding=\"utf8\")\n\nsetup(\n    name=\"forward_forward\",\n    version=\"0.0.1\",\n    packages=find_packages(),\n    install_requires=REQUIREMENTS,\n    long_description=long_description,\n    include_package_data=True,\n    long_description_content_type=\"text/markdown\",\n)\n"
  },
  {
    "path": "optimization/large_speedster/README.md",
    "content": "# ⚡ LargeSpeedster App (WIP)\nAutomatically apply SOTA optimization techniques on large AI models to achieve the maximum acceleration on your hardware.\n\nIf you like this App, give us a star to show your support for the project ⭐\n\n## 📚 Description\nThe LargeSpeedster App is a powerful tool to optimize large AI models (LMs). Leveraging state-of-the-art open-source optimization tools, LargeSpeedster enables the acceleration of large models, i.e. models with a number of parameters in excess of what could be stored on a single GPU. The workflow consists of 3 steps: select, search, and serve.\n\nIn the select step, users input their large model in their preferred deep learning framework and express their preferences regarding the maximum acceptable accuracy loss. This information is used to guide the optimization process and ensure that the resulting model meets the user's needs.\n\nIn the search step, the App automatically tests multiple LM-specific optimization techniques across the software-to-hardware stack, such as SmoothQuant quantization, FlashAttention, and inference-specific kernels. The App also tunes the optimal parallelization strategy and its configuration parameters, allowing it to find the optimal configuration of techniques for accelerating the model.\n\nFinally, in the serve step, the App returns an accelerated version of the user's model in the DL framework of choice, providing a significant boost in performance.\n\nOverall, LargeSpeedster is an easy-to-use tool that allows users to optimize their large AI models and get the most out of their software-to-hardware stack. Try it out today, and reach out if you have any feedback!\n"
  },
  {
    "path": "optimization/nebullvm/.pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/ambv/black\n    rev: 22.3.0\n    hooks:\n      - id: black\n        args: [--line-length=79]\n\n  - repo: https://github.com/pycqa/flake8\n    rev: 3.9.2\n    hooks:\n      - id: flake8\n        args: [--exclude=nebullvm/tools/diffusers.py]\n"
  },
  {
    "path": "optimization/nebullvm/CONTRIBUTING.md",
    "content": "# Guidelines for Contributing to Nebullvm 🚀\n\nHello coder 👋\n\nWe are very happy that you have decided to contribute to the library and we thank you for your efforts. Here you can find guidelines on how to standardize your code with the style we adopted for `nebullvm`.  But remember, there are various ways to help the community other than submitting code contributions: answering questions and improving the documentation are also very valuable.\n\nIt also helps us if you mention our library in your blog posts to show off the cool things it's made possible, or just give the repository a ⭐️ to show us that you appreciate the project.\n\nThis guide was inspired by the awesome [Transformers](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) guide to contributing.\n\nWe hope to come across your pull request soon!\n\nHappy coding 💫 The nebullvm Team\n\n\n## How to submit an issue\nDid you spot a bug? Did you come up with a cool idea that you think should be implemented in nebullvm? Well, GitHub issues are the best way to let us know!\n\nWe don't have a strict policy on issue creation: just use a meaningful title and specify the problem or your proposal in the first comment. Then, you can use GitHub labels to let us know what kind of proposal you are making, for example `bug` if you are reporting a bug or `enhancement` if you are proposing a library improvement. \n\n## How to contribute to solve an issue\nWe are always delighted to welcome other people to the contributors section of nebullvm! We are looking forward to welcoming you to the community. Here are some guidelines to follow:\n1. Please [fork](https://github.com/nebuly-ai/nebullvm/fork) the [library](https://github.com/nebuly-ai/nebullvm) by clicking on the Fork button on the repository's page. This will create a copy of the repository in your GitHub account.\n2. Clone your fork to your local machine, and add the base repository as a remote:\n    ```bash\n    $ git clone git@github.com:<your GitHub handle>/nebullvm.git\n    $ cd nebullvm\n    $ git remote add upstream https://github.com/nebuly-ai/nebullvm.git\n    ```\n3. Install the library in editable mode with the following command:\n    ```bash\n    $ pip install -e .\n    ```\n4. Work on your fork to develop the feature you have in mind.\n5. Nebullvm relies on `black` to format its source code consistently. To use the formatting style defined for nebullvm, run the following commands:\n    ```bash\n    $ pip install pre-commit black autoflake\n    $ pre-commit install\n    # the following command is optional, but needed if you have already \n    # committed some files to your forked repo.\n    $ pre-commit run --all-files\n    ```\n    As for the naming convention, we follow [PEP 8](https://peps.python.org/pep-0008/) for code and a slight variation of [Google convention](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for docstrings. For docstrings we redundantly express the input type in both the function definition and the function docstring.\n6. Once you're happy with your changes, add changed files with `git add` and commit your code:\n    ```bash\n    $ git add edited_file.py\n    $ git commit -m \"Add a cool feature\"\n    ```\n7. Push your changes to your repo:\n    ```bash\n    $ git push\n    ```\n8. Now you can go to the repo you have forked on your GitHub profile and click on **Pull Request** to open a pull request. In the pull request specify which problems it is solving. For instance, if the pull request solves `Issue #1`, the comment should be `Closes #1`. Also make the title of the pull request meaningful and self-explanatory.\n---\n\nSee you soon in the list of nebullvm contributors 🌈\n"
  },
  {
    "path": "optimization/nebullvm/Dockerfile",
    "content": "ARG STARTING_IMAGE=nvcr.io/nvidia/tensorrt:23.03-py3\nFROM ${STARTING_IMAGE}\n\nWORKDIR /\n\n# Set frontend as non-interactive\nARG DEBIAN_FRONTEND=noninteractive\n\nRUN apt-get -y update && apt-get -y upgrade\n\nRUN apt-get install ffmpeg libsm6 libxext6  -y\n\n# Install other libraries\nRUN apt-get install -y sudo wget\n\n# Install libraries\nRUN python3 -m pip install --upgrade pip \\\n    && pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118  \\\n    && pip install --no-cache-dir tensorflow \\\n    && pip install --no-cache-dir xformers \\\n    && pip install --no-cache-dir accelerate \\\n    && python3 -m pip install --no-cache-dir --upgrade tensorrt\n\n# Copy the working dir to the container\nCOPY ../.. /nebullvm\n\n# Install nebullvm\nARG NEBULLVM_VERSION=latest\nRUN if [ \"$NEBULLVM_VERSION\" = \"latest\" ] ; then \\\n        cd nebullvm ; \\\n        pip install . ; \\\n        cd apps/accelerate/speedster ; \\\n        pip install . ; \\\n        cd ../../../.. 
; \\\n        rm -rf nebullvm ; \\\n    else \\\n        pip install --no-cache-dir nebullvm==${NEBULLVM_VERSION} ; \\\n    fi\n\n# Install required python modules\nRUN pip install --no-cache-dir cmake\n\n# Install default deep learning compilers\nARG COMPILER=all\nRUN if [ \"$COMPILER\" = \"all\" ] ; then \\\n        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers all ; \\\n    elif [ \"$COMPILER\" = \"tensorrt\" ] ; then \\\n        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers tensorrt ; \\\n    elif [ \"$COMPILER\" = \"openvino\" ] ; then \\\n        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers openvino ; \\\n    elif [ \"$COMPILER\" = \"onnxruntime\" ] ; then \\\n        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers onnxruntime ; \\\n    fi\n\n# Install TVM\nRUN if [ \"$COMPILER\" = \"all\" ] || [ \"$COMPILER\" = \"tvm\" ] ; then \\\n        pip install --no-cache-dir https://github.com/tlc-pack/tlcpack/releases/download/v0.11.1/tlcpack_cu116-0.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ; \\\n        pip install --no-cache-dir xgboost ; \\\n        python3 -c \"from tvm.runtime import Module\" ; \\\n    fi\n\nENV SIGOPT_PROJECT=\"tmp\"\nENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/tensorrt\nENV CUDA_MODULE_LOADING=\"LAZY\"\n"
  },
  {
    "path": "optimization/nebullvm/LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "optimization/nebullvm/MANIFEST.in",
    "content": "recursive-include nebullvm/installers/tvm_installers *.cmake\nrecursive-include nebullvm/installers *.sh"
  },
  {
    "path": "optimization/nebullvm/README.md",
    "content": "<p align=\"center\">\n<br><br><br>\n<a href=\"https://docs.nebuly.com/welcome/quick-start\"><img src=\"https://user-images.githubusercontent.com/83510798/208247207-861541f0-b968-484c-8a0c-0fb110399c16.png\" width=\"400px\"></a>\n<br><br><br>\n</p>\n\n<p align=\"center\">\n<b>A framework for building optimization modules to boost the performance of your AI systems</b>\n</p>\n\n<p align=\"center\">\n<a href=\"https://pypi.org/project/nebullvm/\"><img src=\"https://badge.fury.io/py/nebullvm.svg\"></a>\n<a href=\"https://pypistats.org/packages/nebullvm\"><img src=\"https://pepy.tech/badge/nebullvm\"></a>\n<a href=\"https://discord.gg/77d5kGSa8e\"><img src=\"https://img.shields.io/badge/Discord-1.1k-blueviolet?logo=discord&amp;logoColor=white&style=round\"></a>\n<a href=\"https://twitter.com/nebuly_ai\"><img src=\"https://img.shields.io/twitter/url.svg?label=Follow%20%40nebuly_ai&style=social&url=https%3A%2F%2Ftwitter.com-nebuly_ai\"></a>\n\n\n</p>\n  \n---\n\n**Documentation**: <a href=\"https://docs.nebuly.com/\" target=\"_blank\"> docs.nebuly.com/ </a>\n\n---\n\n`Nebullvm` is a framework for building the optimization modules needed to optimize the performance of your AI systems. The optimization modules are stack-agnostic and work with any library. They are designed to be easily integrated into your system, providing a quick and seamless boost to its performance. Simply plug and play to start realizing the benefits of optimized performance right away.\n\nIf you like the idea, give us a star to show your support for the project ⭐\n\n\n## **What can this help with?**\n\nWe currently provide multiple modules built on top of the framework:\n\n✅ [Speedster](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster): Automatically apply the best set of SOTA optimization techniques to achieve the maximum inference speed-up on your hardware.\n\n✅ [OpenAlphaTensor](https://github.com/nebuly-ai/nebuly/tree/main/optimization/open_alpha_tensor): Increase the computational performance of an AI model with custom-generated matrix multiplication algorithms fine-tuned for your specific hardware.\n\n✅ [Forward-Forward](https://github.com/nebuly-ai/nebuly/tree/main/optimization/forward_forward): The Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes.\n\n## Next modules and roadmap\nWe are actively working on incorporating the following modules, as requested by members of our community, in upcoming releases:\n\n- [ ]  [CloudSurfer](https://github.com/nebuly-ai/nebuly/blob/main/optimization/cloud_surfer): Automatically discover the optimal cloud configuration and hardware on AWS, GCP and Azure to run your AI models.\n- [ ]  [OptiMate](https://github.com/nebuly-ai/nebuly/blob/main/optimization/optimate): Interactive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup.\n\n## Contributing\nAs an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [linked](https://docs.nebuly.com/contributions) page for more information on how to get involved.\n\n---\n\n<p align=\"center\">\n  <a href=\"https://discord.gg/RbeQMu886J\">Join the community</a> |\n  <a href=\"https://docs.nebuly.com/contributions/\">Contribute to the library</a>\n</p>\n"
  },
  {
    "path": "optimization/nebullvm/azure-pipelines.yml",
    "content": "trigger:\n  branches:\n    include:\n      - main\n  paths:\n    exclude:\n      - .github/*\n      - docs/**\n      - README.md\n      - notebooks/*\n\npool:\n  name: gpu-t4-pool\n\nvariables:\n  imageName: 'nebulydocker/nebullvm'\n\nsteps:\n\n  - script: |\n      nvidia-smi\n    displayName: 'Ensure cuda is installed correctly'\n\n  - script: |\n      pip uninstall -y nebullvm\n      pip install .\n    displayName: 'Install nebullvm'\n\n  - script: |\n      cd apps/accelerate/speedster\n      pip uninstall -y speedster\n      pip install .\n      cd ../../..\n    displayName: 'Install speedster'\n\n  - script: python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117\n    displayName: 'Install PyTorch'\n\n  - script: |\n      export PATH=$PATH:/home/AzDevOps/.local/bin\n      python -m nebullvm.installers.auto_installer --compilers all\n    displayName: 'Install deep learning compilers'\n\n  - script: |\n      python -m pip install -r \"requirements-dev.txt\"\n      pip install pytest-azurepipelines\n    displayName: 'Install requirements for testing'\n\n  - script: |\n      res=$(python -c \"from nebullvm.tools.utils import check_device; print(check_device().type.name == 'GPU')\")\n      if [ \"$res\" = \"False\" ]; then\n          echo \"GPU is not available\"\n          exit 1\n      fi\n      echo \"GPU is available: $res\"\n      res=$(python -c \"import torch; print(torch.cuda.is_available())\")\n      if [ \"$res\" = \"False\" ]; then\n          echo \"CUDA is not available for PyTorch\"\n          exit 1\n      fi\n      echo \"CUDA is available for PyTorch: $res\"\n      res=$(python -c \"import torch; num_devices = torch.cuda.device_count(); print(num_devices is not None and isinstance(num_devices, int) and num_devices > 0)\")\n      if [ \"$res\" = \"False\" ]; then\n          echo \"No CUDA devices found\"\n          exit 1\n      fi\n      echo \"CUDA devices found: $res\"\n    displayName: 
'Check GPU is available'\n\n  - script: |\n      export SPEEDSTER_DISABLE_TELEMETRY=1\n      export PATH=$PATH:/home/AzDevOps/.local/bin\n      cd apps/accelerate/speedster\n      pytest\n      cd ../../..\n    displayName: 'Run api tests'\n    failOnStderr: true\n\n  - script: |\n      export PATH=$PATH:/home/AzDevOps/.local/bin\n      cd nebullvm\n      pytest\n      cd ../\n    displayName: 'Run components tests'\n    failOnStderr: true\n"
  },
  {
    "path": "optimization/nebullvm/docker_build.sh",
    "content": "# Create image with all compilers installed\ndocker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-allcompilers .\n\n# Create an image for each compiler installed\ndocker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-onnxruntime . --build-arg COMPILER=\"onnxruntime\"\ndocker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-openvino . --build-arg COMPILER=\"openvino\"\ndocker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-tvm . --build-arg COMPILER=\"tvm\"\ndocker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-tensorrt . --build-arg COMPILER=\"tensorrt\"\n"
  },
  {
    "path": "optimization/nebullvm/docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the environment for the first two.\nSPHINXOPTS    =\nSPHINXBUILD   = sphinx-build\nSOURCEDIR     = .\nBUILDDIR      = _build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "optimization/nebullvm/docs/README.md",
    "content": "# Documentation\nNebullvm documentation is built using Sphinx and furo! You can follow the guide below to build it locally.\n## Build the docs:\n\n1. Install nebullvm according to [README.md](../../../README.md#step-1-installation-of-nebullvm-library).\n2. Install additional libraries required to build docs:\n```\npip install -r requirements-docs.txt\n```\n3. Run `make html` from this directory.\n\n"
  },
  {
    "path": "optimization/nebullvm/docs/conf.py",
    "content": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common options. For a full\n# list see the documentation:\n# https://www.sphinx-doc.org/en/master/usage/configuration.html\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\n# flake8: noqa\n\nimport os\nimport sys\n\nsys.path.insert(0, os.path.abspath(\"../../../\"))\n\n# import sphinx_rtd_theme\n\n# -- Project information -----------------------------------------------------\n\nproject = \"nebullvm\"\ncopyright = \"2022, nebuly\"\nauthor = \"nebuly\"\n\n# The full version, including alpha/beta/rc tags\n# release = \"0.3.0\"\n\n\n# -- General configuration ---------------------------------------------------\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    \"sphinx.ext.napoleon\",\n    \"sphinx.ext.autodoc\",\n    \"sphinx.ext.intersphinx\",\n    \"sphinx.ext.todo\",\n    \"sphinx.ext.coverage\",\n    \"sphinx.ext.mathjax\",\n    \"sphinx.ext.viewcode\",\n    \"sphinx.ext.githubpages\",\n]\n\n# -- Configurations for plugins ------------\nnapoleon_google_docstring = True\nnapoleon_include_init_with_doc = True\nnapoleon_include_special_with_doc = True\nnapoleon_numpy_docstring = False\nnapoleon_use_rtype = False\nautodoc_inherit_docstrings = False\nautodoc_member_order = \"bysource\"\n\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = []\n\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  
See the documentation for\n# a list of builtin themes.\n#\n\n# html_theme = \"sphinx_rtd_theme\"\nhtml_theme = \"furo\"\n\nhtml_theme_options = {\n    \"light_css_variables\": {\n        \"color-brand-primary\": \"#dark\",\n        \"color-brand-content\": \"#dark\",\n        \"color-admonition-background\": \"#dark\",\n        \"font-stack\": \"Montserrat, sans-serif\",\n        \"font-stack--monospace\": \"Courier, monospace\",\n    },\n    \"footer_icons\": [\n        {\n            \"name\": \"GitHub\",\n            \"url\": \"https://github.com/nebuly-ai/nebullvm\",\n            \"html\": \"\"\"\n                <svg stroke=\"currentColor\" fill=\"currentColor\" stroke-width=\"0\" viewBox=\"0 0 16 16\">\n                    <path fill-rule=\"evenodd\" d=\"M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z\"></path>\n                </svg>\n            \"\"\",\n            \"class\": \"\",\n        },\n    ],\n    \"light_logo\": \"Logo_azure.svg\",\n    \"dark_logo\": \"Logo_azure.svg\",\n}\n\n\nhtml_static_path = [\"_static\"]\nhtml_title = \"\"\n\n\n# html_theme_options = {\n#    \"announcement\": \"<em>Important</em> announcement!\",\n# }\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\n# html_static_path = ['_static']\n"
  },
  {
    "path": "optimization/nebullvm/docs/index.rst",
    "content": "Welcome to nebullvm's documentation!\n======================================\n\n.. toctree::\n   :maxdepth: 2\n\n   modules/index\n\n"
  },
  {
    "path": "optimization/nebullvm/docs/modules/api.rst",
    "content": "nebullvm.api\n=============\n\n.. automodule:: nebullvm\n    :members:\n    \n.. automodule:: nebullvm.api.frontend.huggingface\n    :members:\n    \n"
  },
  {
    "path": "optimization/nebullvm/docs/modules/converters.rst",
    "content": "nebullvm.converters\n===================\n\n.. automodule:: nebullvm.converters\n    :members:\n"
  },
  {
    "path": "optimization/nebullvm/docs/modules/index.rst",
    "content": "API Documentation\n==================\n\n.. toctree::\n    \n    api\n    converters\n    inference_learners\n    installers\n    optimizers\n"
  },
  {
    "path": "optimization/nebullvm/docs/modules/inference_learners.rst",
    "content": "nebullvm.inference_learners\n===========================\n\n.. automodule:: nebullvm.inference_learners\n    :members:\n"
  },
  {
    "path": "optimization/nebullvm/docs/modules/installers.rst",
    "content": "nebullvm.installers\n===================\n\n.. automodule:: nebullvm.installers\n    :members:\n"
  },
  {
    "path": "optimization/nebullvm/docs/modules/optimizers.rst",
    "content": "nebullvm.optimizers\n===================\n\n.. automodule:: nebullvm.optimizers\n    :members:\n"
  },
  {
    "path": "optimization/nebullvm/docs/requirements-docs.txt",
    "content": "Sphinx==4.5.0\ncoloredlogs\nsympy\nfuro"
  },
  {
    "path": "optimization/nebullvm/nebullvm/__init__.py",
    "content": "# The torch import is needed here to work around a strange issue\n# seen with CUDA 11.8: if torch is imported after tensorflow,\n# the process crashes with a core dump.\nfrom nebullvm.optional_modules.torch import torch  # noqa F401\nfrom nebullvm.tools.logger import setup_logger\n\nsetup_logger()\n\n__all__ = [k for k in globals().keys() if not k.startswith(\"_\")]\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/api/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/apps/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/apps/base.py",
    "content": "import abc\n\n\nclass App(abc.ABC):\n    def __init__(self):\n        super().__init__()\n\n    @abc.abstractmethod\n    def execute(self, **kwargs):\n        raise NotImplementedError()\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/config.py",
    "content": "from nebullvm.optional_modules.torch import torch\n\n\nVERSION = \"0.10.0\"\nLEARNER_METADATA_FILENAME = \"metadata.json\"\nONNX_OPSET_VERSION = 13\nNEBULLVM_DEBUG_FILE = \"nebullvm_debug.json\"\n\nAUTO_TVM_TUNING_OPTION = {\n    \"tuner\": \"xgb\",\n    \"trials\": 10,\n    \"early_stopping\": 100,\n}\n# TODO: remove the min_repeat_ms key\nAUTO_TVM_PARAMS = {\n    \"number\": 10,\n    \"repeat\": 1,\n    \"min_repeat_ms\": 0,  # since we're tuning on a CPU, can be set to 0\n    \"timeout\": 10,  # in seconds\n}\n\nNVIDIA_FILENAMES = {\n    \"engine\": \"tensor_rt.engine\",\n    \"metadata\": LEARNER_METADATA_FILENAME,\n}\n\nTVM_FILENAMES = {\"engine\": \"compiled_lib.so\"}\n\nONNX_FILENAMES = {\"model_name\": \"model.onnx\"}\nONNX_PROVIDERS = {\n    \"cuda\": [\n        \"TensorrtExecutionProvider\",\n        \"CUDAExecutionProvider\",\n        \"CPUExecutionProvider\",\n    ],\n    \"cpu\": [\n        \"CPUExecutionProvider\",\n    ],\n}\n\nOPENVINO_FILENAMES = {\n    \"metadata\": LEARNER_METADATA_FILENAME,\n    \"description_file\": \"description.xml\",\n    \"weights\": \"weights.bin\",\n}\n\nTENSORFLOW_BACKEND_FILENAMES = {\n    \"tflite_model\": \"tf_model.tflite\",\n    \"tf_model\": \"tf_model.h5\",\n}\n\nTORCH_TENSORRT_PRECISIONS = {\n    \"torch.float32\": {torch.float},\n    \"torch.float16\": {torch.float, torch.half},\n    \"torch.int8\": {torch.float, torch.half, torch.int8},\n}\n\nMIN_DIM_INPUT_DATA = 100\nQUANTIZATION_DATA_NUM = 300\nCONSTRAINED_METRIC_DROP_THS = 1e-2\nTRAIN_TEST_SPLIT_RATIO = 0.8\n\nCOMPILER_LIST = [\n    \"deepsparse\",\n    \"tensor_rt\",\n    \"torchscript\",\n    \"onnxruntime\",\n    \"tflite\",\n    \"xla\",\n    \"tvm\",\n    \"openvino\",\n    \"bladedisc\",\n    \"intel_neural_compressor\",\n    \"torch_neuron\",\n    \"torch_xla\",\n    \"torch_dynamo\",\n    \"faster_transformer\",\n]\n\nCOMPRESSOR_LIST = [\n    \"sparseml\",\n    \"intel_pruning\",\n]\n\nONNX_MODULES = [\"openvino\", 
\"tensor_rt\"]\n\nTORCH_MODULES = [\n    \"deepsparse\",\n    \"intel_neural_compressor\",\n    \"tensor_rt\",\n    \"torch_tensor_rt\",\n    \"faster_transformer\",\n]\n\nTENSORFLOW_MODULES = []\nHUGGING_FACE_MODULES = []\nDIFFUSERS_MODULES = []\n\nLIBRARIES_GPU = [\"tensor_rt\", \"torch_tensor_rt\", \"faster_transformer\"]\n\nMIN_NUMBER = 1e-4\nDEFAULT_METRIC_DROP_THS = 1e-2\nACTIVATION_METRIC_DROP_THS = 2e-2\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/core/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/core/models.py",
    "content": "import subprocess\nfrom dataclasses import dataclass\nfrom enum import Enum\nfrom functools import cached_property\nfrom typing import Optional, Any, Union, Tuple, List, Dict\n\nimport numpy as np\n\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\n\n\nclass DeepLearningFramework(Enum):\n    PYTORCH = \"torch\"\n    TENSORFLOW = \"tensorflow\"\n    NUMPY = \"numpy\"\n\n\nclass QuantizationType(Enum):\n    DYNAMIC = \"DYNAMIC\"\n    STATIC = \"STATIC\"\n    HALF = \"HALF\"\n\n\nclass Status(Enum):\n    OK = \"OK\"\n    ERROR = \"ERROR\"\n\n\nclass DeviceType(Enum):\n    CPU = \"cpu\"\n    GPU = \"gpu\"\n    TPU = \"tpu\"\n    NEURON = \"neuron\"\n\n\nclass DataType(str, Enum):\n    FLOAT16 = \"float16\"\n    FLOAT32 = \"float32\"\n    INT32 = \"int32\"\n    INT64 = \"int64\"\n\n    @classmethod\n    def from_framework_format(\n        cls, dtype: Union[torch.dtype, tf.dtypes.DType, np.dtype]\n    ):\n        if isinstance(dtype, torch.dtype):\n            framework = \"torch\"\n        elif isinstance(dtype, tf.dtypes.DType):\n            framework = \"tensorflow\"\n        else:\n            framework = \"numpy\"\n            dtype = dtype.type\n        return FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[framework][dtype]\n\n    def to_torch_format(self):\n        for key, value in FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[\n            \"torch\"\n        ].items():\n            if value == self:\n                return key\n\n    def to_tf_format(self):\n        for key, value in FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[\n            \"tensorflow\"\n        ].items():\n            if value == self:\n                return key\n\n    def to_numpy_format(self):\n        for key, value in FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[\n            \"numpy\"\n        ].items():\n            if value == self:\n                return key\n\n\nclass ModelCompiler(Enum):\n    TENSOR_RT = \"tensor_rt\"\n    
TENSOR_RT_ONNX = \"onnx_tensor_rt\"\n    TENSOR_RT_TORCH = \"torch_tensor_rt\"\n    OPENVINO = \"openvino\"\n    APACHE_TVM = \"tvm\"\n    APACHE_TVM_TORCH = \"torch_tvm\"\n    APACHE_TVM_ONNX = \"onnx_tvm\"\n    ONNX_RUNTIME = \"onnxruntime\"\n    DEEPSPARSE = \"deepsparse\"\n    TORCHSCRIPT = \"torchscript\"\n    XLA = \"xla\"\n    TFLITE = \"tflite\"\n    BLADEDISC = \"bladedisc\"\n    INTEL_NEURAL_COMPRESSOR = \"intel_neural_compressor\"\n    TORCH_NEURON = \"torch_neuron\"\n    TORCH_XLA = \"torch_xla\"\n    TORCH_DYNAMO = \"torch_dynamo\"\n    FASTER_TRANSFORMER = \"faster_transformer\"\n\n\nclass ModelCompressor(Enum):\n    SPARSE_ML = \"sparseml\"\n    INTEL_PRUNING = \"intel_pruning\"\n\n\nclass OptimizationTime(Enum):\n    CONSTRAINED = \"constrained\"\n    UNCONSTRAINED = \"unconstrained\"\n\n\n@dataclass\nclass HardwareSetup:\n    cpu: str\n    operating_system: str\n    memory_gb: int\n    accelerator: Optional[str] = None\n\n\n@dataclass\nclass OptimizedModel:\n    inference_learner: Any\n    latency_seconds: float\n    metric_drop: float\n    technique: str\n    compiler: str\n    throughput: float\n    size_mb: float\n\n\n@dataclass\nclass OriginalModel:\n    model: Any\n    latency_seconds: float\n    throughput: float\n    name: str\n    size_mb: float\n    framework: DeepLearningFramework\n\n\n@dataclass\nclass BenchmarkOriginalModelResult:\n    \"\"\"The result of the LatencyOriginalModelMeasureOp\"\"\"\n\n    latency_seconds: float\n    model_outputs: Any\n\n\n@dataclass\nclass OptimizeInferenceResult:\n    \"\"\"The result of the OptimizeInferenceOp\"\"\"\n\n    original_model: OriginalModel\n    hardware_setup: HardwareSetup\n    optimized_model: Optional[OptimizedModel]\n\n    @property\n    def metric_drop(self) -> Optional[float]:\n        if self.optimized_model is None:\n            return None\n        return self.optimized_model.metric_drop\n\n    @cached_property\n    def latency_improvement_rate(self) -> Optional[float]:\n        if 
self.optimized_model is None:\n            return None\n        if self.optimized_model.latency_seconds == 0:\n            return -1\n        return (\n            self.original_model.latency_seconds\n            / self.optimized_model.latency_seconds\n        )\n\n    @cached_property\n    def throughput_improvement_rate(self) -> Optional[float]:\n        if self.optimized_model is None:\n            return None\n        if self.original_model.throughput == 0:\n            return -1\n        return self.optimized_model.throughput / self.original_model.throughput\n\n    @cached_property\n    def size_improvement_rate(self) -> Optional[float]:\n        if self.optimized_model is None:\n            return None\n        if self.optimized_model.size_mb == 0:\n            return 1\n        return self.original_model.size_mb / self.optimized_model.size_mb\n\n\nclass InputInfo:\n    \"\"\"Class for storing all the information needed for creating an input\n    tensor for AI models.\n\n    Attributes:\n        size (tuple): Tuple with the input size (batch size excluded)\n        dtype (str): Data type of the tensor.\n        min_value (int or float, optional): Min value the tensor elements can\n            have.\n        max_value (int or float, optional): Max value the tensor elements can\n            have.\n    \"\"\"\n\n    def __init__(self, size: Tuple[int, ...], dtype: str, **extra_info):\n        self.dtype = DataType(dtype)\n        self.size = size\n        self.__dict__.update(extra_info)\n\n    def __getattr__(self, item):\n        return self.__dict__.get(item)\n\n    def dict(self):\n        return {\n            k: v for k, v in self.__dict__.items() if not k.startswith(\"_\")\n        }\n\n\n@dataclass\nclass DynamicAxisInfo:\n    inputs: List[Dict[int, str]]\n    outputs: List[Dict[int, str]]\n\n    def dict(self):\n        return {\n            k: v for k, v in self.__dict__.items() if not k.startswith(\"_\")\n        }\n\n    def retrieve_output_dim(\n    
    self,\n        input_shapes: List[Tuple[int, ...]],\n        output_idx: int,\n        dimension_idx: int,\n        default_output_value: int,\n    ) -> int:\n        output_tag = self.outputs[output_idx][dimension_idx]\n        for input_dict, input_shape in zip(self.inputs, input_shapes):\n            for key, value in input_dict.items():\n                if (\n                    isinstance(value, dict) and value.get(\"name\") == output_tag\n                ) or value == output_tag:\n                    return input_shape[key]\n        return default_output_value\n\n\n@dataclass\nclass ModelParams:\n    batch_size: int\n    input_infos: List[InputInfo]\n    output_sizes: List[Tuple[int, ...]]\n    output_types: List[DataType]\n    dynamic_info: Union[DynamicAxisInfo, Dict] = None\n\n    def __post_init__(self):\n        if isinstance(self.dynamic_info, dict):\n            self.dynamic_info = DynamicAxisInfo(**self.dynamic_info)\n        self.input_infos = [\n            InputInfo(**x) if isinstance(x, dict) else x\n            for x in self.input_infos\n        ]\n        self.output_types = [DataType(x) for x in self.output_types]\n\n    def dict(self):\n        def recursively_dictionarize(element):\n            if isinstance(element, list):\n                element = [recursively_dictionarize(el) for el in element]\n            elif hasattr(element, \"dict\"):\n                element = element.dict()\n            return element\n\n        return {\n            k: recursively_dictionarize(v)\n            for k, v in self.__dict__.items()\n            if not k.startswith(\"_\")\n        }\n\n    @property\n    def input_sizes(self):\n        for input_info in self.input_infos:\n            yield input_info.size\n\n\nclass Device:\n    def __init__(self, type: DeviceType, idx: int = 0):\n        self.type = type\n        self.idx = idx\n\n    @classmethod\n    def from_str(cls, string: str) -> \"Device\":\n        if string.startswith(\"cuda\") or 
string.startswith(\"gpu\"):\n            return cls(\n                DeviceType.GPU,\n                int(string.split(\":\")[1] if \":\" in string else 0),\n            )\n        elif string.startswith(\"tpu\"):\n            return cls(\n                DeviceType.TPU,\n                int(string.split(\":\")[1] if \":\" in string else 0),\n            )\n\n        return cls(DeviceType.CPU)\n\n    def to_torch_format(self) -> str:\n        if self.type is DeviceType.GPU:\n            return f\"cuda:{self.idx}\"\n        elif self.type is DeviceType.TPU:\n            return f\"xla:{self.idx}\"\n\n        return \"cpu\"\n\n    def to_tf_format(self) -> str:\n        if self.type is DeviceType.GPU:\n            return f\"GPU:{self.idx}\"\n\n        return \"CPU\"\n\n    def get_total_memory(self) -> int:\n        # Return total memory in bytes using nvidia-smi in bytes\n        if self.type is not DeviceType.GPU:\n            raise Exception(\"Device type must be GPU\")\n        else:\n            try:\n                output = (\n                    subprocess.check_output(\n                        \"nvidia-smi --query-gpu=memory.total \"\n                        \"--format=csv,nounits,noheader\",\n                        shell=True,\n                    )\n                    .decode(\"utf-8\")\n                    .split()[self.idx]\n                )\n                return int(output) * 1024 * 1024\n            except Exception:\n                raise Exception(\n                    \"Unable to get total memory of device. 
\"\n                    \"Please make sure nvidia-smi is available.\"\n                )\n\n    def get_free_memory(self) -> int:\n        # Return free memory in bytes using nvidia-smi in bytes\n        if self.type is not DeviceType.GPU:\n            raise Exception(\"Device type must be GPU\")\n        else:\n            try:\n                output = (\n                    subprocess.check_output(\n                        \"nvidia-smi --query-gpu=memory.free \"\n                        \"--format=csv,nounits,noheader\",\n                        shell=True,\n                    )\n                    .decode(\"utf-8\")\n                    .split()[self.idx]\n                )\n                return int(output) * 1024 * 1024\n            except Exception:\n                raise Exception(\n                    \"Unable to get free memory of device. \"\n                    \"Please make sure nvidia-smi is available.\"\n                )\n\n\nFRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT = {\n    \"torch\": {\n        torch.float16: DataType.FLOAT16,\n        torch.float32: DataType.FLOAT32,\n        torch.int32: DataType.INT32,\n        torch.int64: DataType.INT64,\n    },\n    \"tensorflow\": {\n        tf.float16: DataType.FLOAT16,\n        tf.float32: DataType.FLOAT32,\n        tf.int32: DataType.INT32,\n        tf.int64: DataType.INT64,\n    },\n    \"numpy\": {\n        np.float16: DataType.FLOAT16,\n        np.float32: DataType.FLOAT32,\n        np.int32: DataType.INT32,\n        np.int64: DataType.INT64,\n    },\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/core/tests/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/core/tests/test_models.py",
    "content": "import unittest\nfrom unittest.mock import MagicMock\n\nfrom nebullvm.core.models import OptimizeInferenceResult\n\n\nclass TestOptimizeInferenceResult(unittest.TestCase):\n    def test_latency_improvement_rate__optimized_model_is_none(self):\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(),\n            hardware_setup=MagicMock(),\n            optimized_model=None,\n        )\n        self.assertIsNone(res.latency_improvement_rate)\n\n    def test_latency_improvement_rate__optimized_latency_is_zero(self):\n        original_latency = 1.0\n        optimized_latency = 0.0\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(latency_seconds=original_latency),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(latency_seconds=optimized_latency),\n        )\n        self.assertEqual(-1, res.latency_improvement_rate)\n\n    def test_latency_improvement_rate__original_latency_is_zero(self):\n        original_latency = 0.0\n        optimized_latency = 1.0\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(latency_seconds=original_latency),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(latency_seconds=optimized_latency),\n        )\n        self.assertEqual(0, res.latency_improvement_rate)\n\n    def test_latency_improvement_rate__rate_gt_1(self):\n        original_latency = 1.0\n        optimized_latency = 0.5\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(latency_seconds=original_latency),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(latency_seconds=optimized_latency),\n        )\n        self.assertGreater(res.latency_improvement_rate, 1)\n\n    def test_latency_improvement_rate__rate_lt_1(self):\n        original_latency = 0.5\n        optimized_latency = 1.0\n        res = OptimizeInferenceResult(\n            
original_model=MagicMock(latency_seconds=original_latency),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(latency_seconds=optimized_latency),\n        )\n        self.assertLess(res.latency_improvement_rate, 1)\n\n    def test_th_improvement_rate__optimized_model_is_none(self):\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(),\n            hardware_setup=MagicMock(),\n            optimized_model=None,\n        )\n        self.assertIsNone(res.throughput_improvement_rate)\n\n    def test_th_improvement_rate__optimized_th_is_zero(self):\n        original_th = 1.0\n        optimized_th = 0.0\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(throughput=original_th),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(throughput=optimized_th),\n        )\n        self.assertEqual(0, res.throughput_improvement_rate)\n\n    def test_th_improvement_rate__original_th_is_zero(self):\n        original_th = 0.0\n        optimized_th = 1.0\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(throughput=original_th),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(throughput=optimized_th),\n        )\n        self.assertEqual(-1, res.throughput_improvement_rate)\n\n    def test_th_improvement_rate__rate_gt_1(self):\n        original_th = 0.5\n        optimized_th = 1\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(throughput=original_th),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(throughput=optimized_th),\n        )\n        self.assertGreater(res.throughput_improvement_rate, 1)\n\n    def test_th_improvement_rate__rate_lt_1(self):\n        original_th = 1.0\n        optimized_th = 0.5\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(throughput=original_th),\n            hardware_setup=MagicMock(),\n          
  optimized_model=MagicMock(throughput=optimized_th),\n        )\n        self.assertLess(res.throughput_improvement_rate, 1)\n\n    def test_size_improvement_rate__optimized_model_is_none(self):\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(),\n            hardware_setup=MagicMock(),\n            optimized_model=None,\n        )\n        self.assertIsNone(res.size_improvement_rate)\n\n    def test_size_improvement_rate__optimized_size_is_zero(self):\n        original_size = 1.0\n        optimized_size = 0.0\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(size_mb=original_size),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(size_mb=optimized_size),\n        )\n        self.assertEqual(1, res.size_improvement_rate)\n\n    def test_size_improvement_rate__original_size_is_zero(self):\n        original_size = 0.0\n        optimized_size = 1.0\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(size_mb=original_size),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(size_mb=optimized_size),\n        )\n        self.assertEqual(0, res.size_improvement_rate)\n\n    def test_size_improvement_rate__rate_gt_1(self):\n        original_size = 1\n        optimized_size = 0.5\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(size_mb=original_size),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(size_mb=optimized_size),\n        )\n        self.assertGreater(res.size_improvement_rate, 1)\n\n    def test_size_improvement_rate__rate_lt_1(self):\n        original_size = 0.5\n        optimized_size = 1\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(size_mb=original_size),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(size_mb=optimized_size),\n        )\n        self.assertLess(res.size_improvement_rate, 1)\n\n 
   def test_metric_drop__optimized_model_is_none(self):\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(),\n            hardware_setup=MagicMock(),\n            optimized_model=None,\n        )\n        self.assertIsNone(res.metric_drop)\n\n    def test_metric_drop(self):\n        metric_drop = 0.1\n        res = OptimizeInferenceResult(\n            original_model=MagicMock(),\n            hardware_setup=MagicMock(),\n            optimized_model=MagicMock(metric_drop=metric_drop),\n        )\n        self.assertEqual(metric_drop, res.metric_drop)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/core/types.py",
    "content": "from typing import Union, Iterable, Sequence\n\nfrom nebullvm.tools.data import DataManager\n\nInputData = Union[Iterable, Sequence, DataManager]\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/__init__.py",
    "content": "# flake8: noqa\n\n__all__ = [k for k in globals().keys() if not k.startswith(\"_\")]\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/auto_installer.py",
    "content": "import argparse\nfrom typing import List, Union\n\nfrom loguru import logger\n\nfrom nebullvm.config import (\n    ONNX_MODULES,\n    TENSORFLOW_MODULES,\n    TORCH_MODULES,\n    HUGGING_FACE_MODULES,\n    DIFFUSERS_MODULES,\n)\nfrom nebullvm.installers.installers import (\n    ONNXInstaller,\n    PytorchInstaller,\n    TensorflowInstaller,\n    HuggingFaceInstaller,\n    DiffusersInstaller,\n)\n\n\nSUPPORTED_BACKENDS_DICT = {\n    \"torch\": [\"onnx\"],\n    \"tensorflow\": [\"onnx\"],\n    \"huggingface\": [\"torch\", \"tensorflow\", \"onnx\"],\n    \"diffusers\": [\"torch\", \"onnx\"],\n    \"onnx\": [],\n}\n\nINSTALLERS = {\n    \"onnx\": ONNXInstaller,\n    \"torch\": PytorchInstaller,\n    \"tensorflow\": TensorflowInstaller,\n    \"huggingface\": HuggingFaceInstaller,\n    \"diffusers\": DiffusersInstaller,\n}\n\nMODULES = {\n    \"onnx\": ONNX_MODULES,\n    \"torch\": TORCH_MODULES,\n    \"tensorflow\": TENSORFLOW_MODULES,\n    \"huggingface\": HUGGING_FACE_MODULES,\n    \"diffusers\": DIFFUSERS_MODULES,\n}\n\n\ndef select_frameworks_to_install(\n    include_frameworks: Union[List[str], str],\n    include_backends: Union[List[str], str],\n) -> List[str]:\n    supported_frameworks = list(INSTALLERS.keys())\n    if isinstance(include_frameworks, str) and include_frameworks == \"all\":\n        frameworks_list = supported_frameworks\n    elif isinstance(include_frameworks, list):\n        frameworks_list = []\n        for framework in include_frameworks:\n            if framework in supported_frameworks:\n                frameworks_list.append(framework)\n            else:\n                logger.warning(f\"Framework {framework} not supported\")\n\n        if len(frameworks_list) == 0:\n            raise ValueError(\"No supported frameworks selected\")\n\n        if isinstance(include_backends, str) and include_backends == \"all\":\n            for framework in frameworks_list:\n                for backend in 
SUPPORTED_BACKENDS_DICT[framework]:\n                    frameworks_list.append(backend)\n        elif isinstance(include_backends, list):\n            for backend in include_backends:\n                if backend not in supported_frameworks:\n                    logger.warning(f\"Backend {backend} not supported\")\n                else:\n                    backend_supported = False\n                    for framework in frameworks_list:\n                        if backend in SUPPORTED_BACKENDS_DICT[framework]:\n                            frameworks_list.append(backend)\n                            backend_supported = True\n                            break\n                    if not backend_supported:\n                        logger.warning(\n                            f\"Backend {backend} not supported for selected \"\n                            f\"frameworks\"\n                        )\n        else:\n            raise ValueError(\"Invalid backends list\")\n    else:\n        raise ValueError(\"Invalid frameworks list\")\n\n    frameworks_list = list(set(frameworks_list))\n    frameworks_list.sort()\n\n    return frameworks_list\n\n\ndef select_compilers_to_install(\n    include_compilers: Union[List[str], str], framework_list: List[str]\n) -> List[str]:\n    compiler_list = []\n    supported_compilers = list(\n        set([item for sublist in MODULES.values() for item in sublist])\n    )\n    if isinstance(include_compilers, str) and include_compilers == \"all\":\n        compiler_list = list(\n            set(\n                [\n                    item\n                    for (fr, compilers) in MODULES.items()\n                    for item in compilers\n                    if fr in framework_list\n                ]\n            )\n        )\n    else:\n        for compiler in include_compilers:\n            if compiler not in supported_compilers:\n                logger.warning(f\"Compiler {compiler} not supported\")\n            else:\n                
compiler_supported = False\n                for framework in framework_list:\n                    if compiler in MODULES[framework]:\n                        compiler_list.append(compiler)\n                        compiler_supported = True\n                        break\n                if not compiler_supported:\n                    logger.warning(\n                        f\"Compiler {compiler} not supported for selected \"\n                        f\"frameworks\"\n                    )\n\n    compiler_list = list(set(compiler_list))\n    compiler_list.sort()\n\n    return compiler_list\n\n\ndef auto_install_libraries(\n    include_frameworks: Union[List[str], str] = \"all\",\n    include_backends: Union[List[str], str] = \"all\",\n    include_compilers: Union[List[str], str] = \"all\",\n):\n    logger.info(\"Running auto install of nebullvm dependencies\")\n\n    framework_list = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    compilers_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    for framework in framework_list:\n        framework_installer = INSTALLERS[framework](MODULES[framework])\n        if not framework_installer.check_framework():\n            framework_installer.install_framework()\n        framework_installer.install_dependencies(framework_list)\n        framework_installer.install_compilers(compilers_list)\n\n\ndef main():\n    parser = argparse.ArgumentParser(\n        description=\"Auto install dl frameworks and dependencies\"\n    )\n    parser.add_argument(\n        \"-f\",\n        \"--frameworks\",\n        help=\"The base dl frameworks to be installed\",\n        default=\"all\",\n        nargs=\"+\",\n    )\n    parser.add_argument(\n        \"-b\",\n        \"--extra-backends\",\n        help=\"additional dl frameworks to be installed to \"\n        \"gain the optimal speedup\",\n        default=\"all\",\n        nargs=\"+\",\n    )\n    
parser.add_argument(\n        \"-c\",\n        \"--compilers\",\n        help=\"Compilers to be installed\",\n        default=\"all\",\n        nargs=\"+\",\n    )\n    args = vars(parser.parse_args())\n\n    if len(args[\"frameworks\"]) == 1 and args[\"frameworks\"][0] == \"all\":\n        framework_list = \"all\"\n    else:\n        framework_list = args[\"frameworks\"]\n\n    if len(args[\"extra_backends\"]) == 1 and args[\"extra_backends\"][0] in [\n        \"all\",\n        \"none\",\n    ]:\n        if args[\"extra_backends\"][0] == \"all\":\n            backend_list = \"all\"\n        else:\n            backend_list = []\n    else:\n        backend_list = args[\"extra_backends\"]\n\n    if len(args[\"compilers\"]) == 1 and args[\"compilers\"][0] == \"all\":\n        compilers_list = \"all\"\n    else:\n        compilers_list = args[\"compilers\"]\n\n    auto_install_libraries(framework_list, backend_list, compilers_list)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/install_bladedisc.sh",
    "content": "#!/bin/bash\n\n# Set non interactive mode for apt-get\nexport DEBIAN_FRONTEND=noninteractive\n\nif [ ! -d \"BladeDISC\" ]\nthen\n  git clone https://github.com/alibaba/BladeDISC.git\nfi\n\ncd BladeDISC && git submodule update --init --recursive\n\n# Install bazel\nsudo apt install -y apt-transport-https curl gnupg\ncurl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg\nsudo mv bazel-archive-keyring.gpg /usr/share/keyrings\necho \"deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8\" | sudo tee /etc/apt/sources.list.d/bazel.list\nsudo apt update && sudo apt install -y bazel\nsudo apt install -y default-jdk\n\nif [ \"$1\" == \"true\" ]\nthen\ncd pytorch_blade && bash ./scripts/build_pytorch_blade.sh\nelse\n  if [[ $OSTYPE == \"darwin\"* ]]\n  then\n    export TORCH_BLADE_BUILD_WITH_CUDA_SUPPORT=OFF\n    export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.10.0+aarch64\n    cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh\n  else\n    export TORCH_BLADE_BUILD_WITH_CUDA_SUPPORT=OFF\n    export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.8.1+cpu\n    cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh\n  fi\nfi\n\ncd ../..\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/install_fastertransformer.sh",
    "content": "#!/bin/bash\n\n# TODO: check requirements\n# https://github.com/NVIDIA/FasterTransformer/blob/main/docs/bert_guide.md\n# Requirements\n#CMake >= 3.8 for Tensorflow, CMake >= 3.13 for PyTorch\n#CUDA 11.0 or newer version\n#Python: Only verified on Python 3\n#Tensorflow: Verified on 1.15; 1.13 and 1.14 should work.\n#PyTorch: Verified on 1.8.0; >= 1.5.0 should work.\n\n\n# Set non interactive mode for apt-get\nexport DEBIAN_FRONTEND=noninteractive\n\nif [[ $OSTYPE == \"darwin\"* ]]\nthen\n  echo \"MacOS is not supported for FasterTransformer\"\n  exit 1\nfi\n\nif [ ! -d \"FasterTransformer\" ]\nthen\n  git clone --recursive https://github.com/NVIDIA/FasterTransformer FasterTransformer\nfi\n\n# TODO: checkout to latest release\n\ncd FasterTransformer &&\nmkdir -p build &&\ncd build &&\ncmake -DSM=$COMPUTE_CAPABILITY -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON  .. &&\nmake -j8 &&\ntouch ../../FasterTransformer_build_success  # create a file to indicate that the build was successful\n\n# TODO: enable multi gpu if possible\n#-DBUILD_MULTI_GPU=OFF"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/install_tensor_rt.sh",
    "content": "#!/bin/bash\n\nif [[ \"$(grep '^ID_LIKE' /etc/os-release)\" == *\"centos\"* ]]\nthen\n  # Installation for centos type linux distribution\n  # Try installation with pip if fails then install from source\n  pip3 install --upgrade \"setuptools<=65.7.0\" pip\n  # If cuda version is less than 12.0 then install tensorrt<=8.5.3.1\n  if [[ $(nvidia-smi | grep CUDA | awk '{print $9}' | cut -d '.' -f 1) -lt 12 ]]\n  then\n    python3 -m pip install --upgrade \"tensorrt<=8.5.3.1\"\n  else\n    python3 -m pip install --upgrade \"tensorrt<=8.6.1\"\n  fi\n  pip3 install colored polygraphy --extra-index-url https://pypi.ngc.nvidia.com\n\n  if [[ $(python3 -c \"import tensorrt; print(tensorrt.__version__); assert tensorrt.Builder(tensorrt.Logger())\" || echo 1) == 1 ]]\n  then\n    # Uninstall previous version\n    pip3 uninstall nvidia-tensorrt\n    # install pre-requisites\n    pip3 install numpy\n    yum update && \\\n      yum -y install libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev \\\n      libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer && \\\n      rm -rf /var/lib/apt/lists/*\n  fi\nelse\n  # Try installation with pip if fails then install from source\n  pip install --upgrade \"setuptools<=65.7.0\" pip\n  # If cuda version is less than 12.0 then install tensorrt<=8.5.3.1\n   if [[ $(nvidia-smi | grep CUDA | awk '{print $9}' | cut -d '.' 
-f 1) -lt 12 ]]\n  then\n    python3 -m pip install --upgrade \"tensorrt<=8.5.3.1\"\n  else\n    python3 -m pip install --upgrade \"tensorrt<=8.6.1\"\n  fi\n\n  pip install colored polygraphy --extra-index-url https://pypi.ngc.nvidia.com\n\n  if [[ $(python3 -c \"import tensorrt; print(tensorrt.__version__); assert tensorrt.Builder(tensorrt.Logger())\" || echo 1) == 1 ]]\n  then\n    # Uninstall previous version\n    pip uninstall nvidia-tensorrt\n    # install pre-requisites\n    pip install numpy\n    apt-get update && \\\n      apt-get -y install libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev \\\n      libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer && \\\n      rm -rf /var/lib/apt/lists/*\n  fi\nfi\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/install_tvm.sh",
    "content": "#!/bin/bash\n\n# Set non interactive mode for apt-get\nexport DEBIAN_FRONTEND=noninteractive\n\nif [ ! -d \"tvm\" ]\nthen\n  git clone --recursive https://github.com/apache/tvm tvm\nfi\n\ncd tvm\nmkdir -p build\ncp \"$CONFIG_PATH\" build/\ncd build\ncmake ..\nmake -j8\nif [[ $OSTYPE == \"darwin\"* ]]\nthen\n  pip install tornado\n  brew install openblas gfortran\n  pip install pybind11 cython pythran\n  conda install -y scipy\n  pip install xgboost decorator\n  export MACOSX_DEPLOYMENT_TARGET=10.9\nelse\n  pip3 install decorator attrs tornado psutil xgboost cloudpickle\nfi\ncd ../python\npython3 setup.py install --user\ncd ../..\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/install_tvm_prerequisites.sh",
    "content": "#!/bin/bash\n\n# Set non interactive mode for apt-get\nexport DEBIAN_FRONTEND=noninteractive\n\nif [[ $OSTYPE == \"darwin\"* ]]\nthen\n  brew install gcc git cmake\n  #brew install llvm\n  conda install -y -c conda-forge clangdev\nelif [[ \"$(grep '^ID_LIKE' /etc/os-release)\" == *\"centos\"* ]]\nthen\n  sudo yum update -y && sudo yum install -y gcc gcc-c++ llvm-devel cmake3 git\n  if [ -f \"/usr/bin/cmake\" ]\n  then\n    sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 \\\n      --slave /usr/local/bin/ctest ctest /usr/bin/ctest \\\n      --slave /usr/local/bin/cpack cpack /usr/bin/cpack \\\n      --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake \\\n      --family cmake\n    sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 \\\n      --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 \\\n      --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 \\\n      --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3 \\\n      --family cmake\n  else\n    sudo ln -s /usr/bin/cmake3 /usr/bin/cmake\n  fi\nelse\n  sudo apt-get update && sudo apt-get install -y libpython3.8 gcc libtinfo-dev zlib1g-dev \\\n    build-essential cmake libedit-dev libxml2-dev llvm-12\nfi\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/installers.py",
    "content": "import os\nimport platform\nimport subprocess\nimport sys\nfrom abc import ABC\nfrom pathlib import Path\nfrom typing import List\n\nimport cpuinfo\nfrom loguru import logger\n\nfrom nebullvm.config import LIBRARIES_GPU\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    deepsparse_is_available,\n    get_faster_transformer_repo_path,\n    intel_neural_compressor_is_available,\n    openvino_is_available,\n    tensorrt_is_available,\n    torch_tensorrt_is_available,\n)\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.utils import check_module_version, gpu_is_available\n\n\ndef get_cpu_arch():\n    arch = cpuinfo.get_cpu_info()[\"arch\"].lower()\n    if \"x86\" in arch:\n        return \"x86\"\n    else:\n        return \"arm\"\n\n\ndef _get_os():\n    return platform.system()\n\n\ndef install_tvm(\n    working_dir: str = None,\n):\n    \"\"\"Helper function for installing ApacheTVM.\n\n    This function needs some prerequisites for running, as a valid `git`\n    installation and having MacOS or a Linux-distribution as OS.\n\n    Args:\n        working_dir (str, optional): The directory where the tvm repo will be\n            cloned and installed.\n    \"\"\"\n    path = Path(__file__).parent\n    # install pre-requisites\n    installation_file_prerequisites = str(\n        path / \"install_tvm_prerequisites.sh\"\n    )\n    subprocess.run(\n        [\"bash\", installation_file_prerequisites],\n        cwd=working_dir or Path.home(),\n    )\n    installation_file = str(path / \"install_tvm.sh\")\n    hardware_config = get_cpu_arch()\n    if gpu_is_available():\n        hardware_config = f\"{hardware_config}_cuda\"\n    env_dict = {\n        \"CONFIG_PATH\": str(\n            path / f\"tvm_installers/{hardware_config}/config.cmake\"\n        ),\n        **dict(os.environ.copy()),\n    }\n    subprocess.run(\n        [\"bash\", installation_file],\n        cwd=working_dir or Path.home(),\n        env=env_dict,\n   
 )\n\n    try:\n        import tvm  # noqa F401\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_bladedisc():\n    \"\"\"Helper function for installing BladeDisc.\"\"\"\n    has_cuda = False\n    if gpu_is_available():\n        has_cuda = True\n\n    path = Path(__file__).parent\n    installation_file = str(path / \"install_bladedisc.sh\")\n    subprocess.run([\"bash\", installation_file, str(has_cuda).lower()])\n\n    try:\n        import torch_blade  # noqa F401\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_torch_tensor_rt():\n    \"\"\"Helper function for installing Torch-TensorRT.\n\n    The function will install the software only if a cuda driver is available.\n    \"\"\"\n    if not gpu_is_available():\n        raise RuntimeError(\n            \"Torch-TensorRT can run just on Nvidia machines. \"\n            \"No available cuda driver has been found.\"\n        )\n    elif not check_module_version(\n        torch, min_version=\"1.12.0\", max_version=\"1.13.1+cu117\"\n    ):\n        logger.warning(\n            \"Torch-TensorRT can be installed only for \"\n            \"'PyTorch>=1.12, <=1.13.1'. 
Please update your Pytorch \"\n            \"version accordingly if you want to use Torch-TensorRT.\"\n        )\n        return False\n\n    # Verify that TensorRT is installed, otherwise install it\n    try:\n        import tensorrt  # noqa F401\n    except ImportError:\n        install_tensor_rt()\n\n    cmd = [\n        \"pip3\",\n        \"install\",\n        \"torch-tensorrt\",\n        \"--find-links\",\n        \"https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.3.0\",\n    ]\n    subprocess.run(cmd)\n    cuda_version = subprocess.check_output([\"nvidia-smi\"])\n    cuda_version = int(\n        cuda_version.decode(\"utf-8\")\n        .split(\"\\n\")[2]\n        .split(\"|\")[-2]\n        .split(\":\")[-1]\n        .strip()\n        .split(\".\")[0]\n    )\n    if cuda_version >= 12:\n        cmd = [\n            \"pip3\",\n            \"install\",\n            \"tensorrt>=8.6.0,<=8.6.1\",\n        ]\n        subprocess.run(cmd)\n\n    try:\n        import torch_tensorrt  # noqa F401\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_tf2onnx():\n    if _get_os() == \"Darwin\" and get_cpu_arch() == \"arm\":\n        cmd = [\"conda\", \"install\", \"-y\", \"tf2onnx>=1.8.4\"]\n        subprocess.run(cmd)\n    else:\n        cmd = [\"pip3\", \"install\", \"--user\", \"protobuf<4,>=3.20.2\"]\n        subprocess.run(cmd)\n\n        cmd = [\"pip3\", \"install\", \"tf2onnx>=1.8.4\"]\n        subprocess.run(cmd)\n\n    try:\n        import tf2onnx  # noqa F401\n    except ImportError:\n        return False\n    except AttributeError:\n        # Sometimes the import could raise an attribute error\n        # if installation fails\n        pass\n\n    return True\n\n\ndef install_tensor_rt():\n    \"\"\"Helper function for installing TensorRT.\n\n    The function will install the software only if a cuda driver is available.\n    \"\"\"\n    if not gpu_is_available():\n        raise RuntimeError(\n            \"TensorRT can 
run just on Nvidia machines. \"\n            \"No available cuda driver has been found.\"\n        )\n    path = Path(__file__).parent\n    installation_file = str(path / \"install_tensor_rt.sh\")\n    subprocess.run([\"bash\", installation_file])\n\n    try:\n        import polygraphy  # noqa F401\n        import tensorrt  # noqa F401\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_openvino(with_optimization: bool = True):\n    \"\"\"Helper function for installing the OpenVino compiler.\n\n    This function just works on intel machines.\n\n    Args:\n        with_optimization (bool): Flag for installing the full openvino engine\n            or limiting the installation to the tools need for inference\n            models.\n    \"\"\"\n    processor = cpuinfo.get_cpu_info()[\"brand_raw\"].lower()\n    if \"intel\" not in processor:\n        raise RuntimeError(\n            f\"Openvino can run just on Intel machines. \"\n            f\"You are trying to install it on {processor}\"\n        )\n\n    openvino_version = \"openvino-dev\" if with_optimization else \"openvino\"\n    # If on windows\n    if _get_os() == \"Windows\":\n        cmd = [\"pip3\", \"install\", \"--user\", f\"{openvino_version}>=2022.1.0\"]\n    else:\n        cmd = [\"pip3\", \"install\", f\"{openvino_version}>=2022.1.0\"]\n    subprocess.run(cmd)\n\n    cmd = [\"pip3\", \"install\", \"scipy>=1.7.3\"]\n    subprocess.run(cmd)\n\n    try:\n        from openvino.runtime import (  # noqa F401\n            CompiledModel,\n            Core,\n            InferRequest,\n            Model,\n        )\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_onnxruntime():\n    \"\"\"Helper function for installing the right version of onnxruntime.\"\"\"\n    distribution_name = \"onnxruntime\"\n    if gpu_is_available():\n        distribution_name = f\"{distribution_name}-gpu\"\n    if _get_os() == \"Darwin\" and get_cpu_arch() == \"arm\":\n      
  cmd = [\"conda\", \"install\", \"-y\", distribution_name]\n    else:\n        cmd = [\"pip3\", \"install\", distribution_name]\n    subprocess.run(cmd)\n    # install requirements for onnxruntime.transformers\n    cmd = [\"pip3\", \"install\", \"coloredlogs\", \"sympy\"]\n    subprocess.run(cmd)\n\n    try:\n        import onnxruntime  # noqa F401\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_deepsparse():\n    \"\"\"Helper function for installing DeepSparse.\"\"\"\n    python_minor_version = sys.version_info.minor\n\n    os_ = platform.system()\n    if os_ in [\"Darwin\", \"Windows\"] or get_cpu_arch() == \"arm\":\n        raise RuntimeError(\n            \"DeepSparse is not supported on this platform. \"\n            \"It won't be installed.\"\n        )\n\n    try:\n        cmd = [\"apt-get\", \"install\", f\"python3.{python_minor_version}-venv\"]\n        subprocess.run(cmd)\n    except Exception:\n        pass\n\n    cmd = [\"pip3\", \"install\", \"deepsparse\"]\n    subprocess.run(cmd)\n\n    try:\n        cmd = [\"pip3\", \"install\", \"numpy>=1.22.0,<1.24.0\"]\n        subprocess.run(cmd)\n    except Exception:\n        # For python 3.7 numpy 1.22.0 is not available\n        pass\n\n    try:\n        from deepsparse import compile_model, cpu  # noqa F401\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_intel_neural_compressor():\n    \"\"\"Helper function for installing Intel Neural Compressor.\"\"\"\n\n    processor = cpuinfo.get_cpu_info()[\"brand_raw\"].lower()\n    if \"intel\" not in processor:\n        raise RuntimeError(\n            f\"Intel Neural Compressor can run just on Intel machines. 
\"\n            f\"You are trying to install it on {processor}\"\n        )\n\n    cmd = [\"pip3\", \"install\", \"--user\", \"neural-compressor\"]\n    subprocess.run(cmd)\n\n    try:\n        from neural_compressor.experimental import (  # noqa F401\n            MixedPrecision,\n            Quantization,\n        )\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_onnx_simplifier():\n    \"\"\"Helper function for installing ONNX simplifier.\"\"\"\n\n    if get_cpu_arch() != \"arm\":\n        # Install onnx simplifier\n        cmd = [\"pip3\", \"install\", \"onnxsim\"]\n        subprocess.run(cmd)\n\n    try:\n        import onnxsim  # noqa F401\n    except ImportError:\n        return False\n\n    return True\n\n\ndef install_faster_transformer(\n    working_dir: str = None,\n):\n    \"\"\"Helper function for installing FasterTransformer.\n    https://github.com/NVIDIA/FasterTransformer\n\n    This function needs some prerequisites for running, as a valid `git`\n    installation and having MacOS or a Linux-distribution as OS.\n\n    Args:\n        working_dir (str, optional): The directory where the FasterTransformer\n        repo will be cloned and installed. 
Default: None\n    \"\"\"\n    if not gpu_is_available():\n        return False\n    path = Path(__file__).parent\n    # install faster transformer\n    try:\n        import torch\n\n        CP = compute_capability = torch.cuda.get_device_capability()\n        assert len(compute_capability) == 2\n    except (ImportError, AssertionError):\n        return False\n    installation_file = str(path / \"install_fastertransformer.sh\")\n    env_dict = {\n        \"COMPUTE_CAPABILITY\": f\"{CP[0]}{CP[1]}\",\n        **dict(os.environ.copy()),\n    }\n\n    result = subprocess.run(\n        [\"bash\", installation_file],\n        cwd=get_faster_transformer_repo_path().parent,\n        env=env_dict,\n    )\n    # check result\n    if result.returncode != 0:\n        return False\n    return True\n\n\nclass BaseInstaller(ABC):\n    def __init__(self, module_list: List[str]):\n        self.modules = module_list\n\n    def install_compilers(\n        self,\n        include_libraries: List[str],\n    ):\n        for library in self.modules:\n            if (\n                isinstance(include_libraries, List)\n                and library not in include_libraries\n            ) or (not gpu_is_available() and library in LIBRARIES_GPU):\n                continue\n\n            logger.info(f\"Trying to install {library} on the platform...\")\n\n            try:\n                if not COMPILERS_AVAILABLE[library]():\n                    install_ok = COMPILER_INSTALLERS[library]()\n                else:\n                    install_ok = True\n            except Exception:\n                install_ok = False\n\n            if not install_ok:\n                logger.warning(\n                    f\"Unable to install {library} on this platform. \"\n                    f\"The compiler will be skipped. 
\"\n                )\n            else:\n                logger.info(f\"{library} installed successfully!\")\n\n    @staticmethod\n    def install_dependencies(include_framework: List[str]):\n        raise NotImplementedError\n\n    @staticmethod\n    def check_framework():\n        raise NotImplementedError\n\n    @staticmethod\n    def install_framework():\n        raise NotImplementedError\n\n\nclass PytorchInstaller(BaseInstaller):\n    @staticmethod\n    def install_dependencies(include_framework: List[str]):\n        return\n\n    @staticmethod\n    def check_framework():\n        try:\n            import torch  # noqa F401\n        except ImportError:\n            raise ImportError(\n                \"No PyTorch found in your python environment. Please install \"\n                \"it from https://pytorch.org/get-started/locally/.\"\n            )\n\n        if not check_module_version(\n            torch, min_version=\"1.12.0\", max_version=\"2.0.1+cu118\"\n        ):\n            logger.warning(\n                \"PyTorch version is not supported. 
Please install \"\n                \"PyTorch >= 1.12.0 and <= 2.0.1.\"\n            )\n\n        return True\n\n    @staticmethod\n    def install_framework():\n        cmd = [\"pip3\", \"install\", \"torch>=1.12.0, <=2.0.1\"]\n        subprocess.run(cmd)\n\n        try:\n            import torch  # noqa F401\n        except ImportError:\n            return False\n\n        return True\n\n\nclass TensorflowInstaller(BaseInstaller):\n    @staticmethod\n    def install_dependencies(include_framework: List[str]):\n        if \"onnx\" in include_framework:\n            install_tf2onnx()\n\n    @staticmethod\n    def check_framework():\n        try:\n            import tensorflow  # noqa F401\n        except ImportError:\n            return False\n\n        if not check_module_version(\n            tensorflow, min_version=\"2.7.0\", max_version=\"2.12.0\"\n        ):\n            logger.warning(\n                \"TensorFlow version is not supported. Please install \"\n                \"TensorFlow >= 2.7.0 and <= 2.12.0.\"\n            )\n            return False\n\n        return True\n\n    @staticmethod\n    def install_framework():\n        if _get_os() == \"Darwin\" and get_cpu_arch() == \"arm\":\n            cmd = [\n                \"conda\",\n                \"install\",\n                \"-y\",\n                \"tensorflow>=2.7.0,<=2.12.0\",\n                \"numpy<1.24\",\n            ]\n            subprocess.run(cmd)\n        else:\n            cmd = [\"pip3\", \"install\", \"--user\", \"tensorflow>=2.7.0, <=2.12.0\"]\n            subprocess.run(cmd)\n\n        try:\n            import tensorflow  # noqa F401\n        except ImportError:\n            return False\n\n        return True\n\n\nclass ONNXInstaller(BaseInstaller):\n    @staticmethod\n    def install_dependencies(include_framework: List[str]):\n        install_onnxruntime()\n        cmd = [\"pip3\", \"install\", \"onnxmltools>=1.11.0\"]\n        subprocess.run(cmd)\n        
install_onnx_simplifier()\n\n    @staticmethod\n    def check_framework():\n        try:\n            import onnx  # noqa F401\n        except ImportError:\n            return False\n\n        if not check_module_version(\n            onnx, min_version=\"1.10.0\", max_version=\"1.14.0\"\n        ):\n            logger.warning(\n                \"ONNX version is not supported. Please install \"\n                \"ONNX >= 1.10.0 and <= 1.14.0.\"\n            )\n            return False\n\n        return True\n\n    @staticmethod\n    def install_framework():\n        if _get_os() == \"Darwin\" and get_cpu_arch() == \"arm\":\n            cmd = [\"pip3\", \"install\", \"cmake\"]\n            subprocess.run(cmd)\n\n        cmd = [\"pip3\", \"install\", \"onnx>=1.10.0, <=1.14.0\"]\n        subprocess.run(cmd)\n\n        try:\n            import onnx  # noqa F401\n        except ImportError:\n            return False\n\n        return True\n\n\nclass HuggingFaceInstaller(BaseInstaller):\n    @staticmethod\n    def install_dependencies(include_framework: List[str]):\n        pass\n\n    @staticmethod\n    def check_framework():\n        try:\n            import transformers  # noqa F401\n        except ImportError:\n            return False\n\n        return True\n\n    @staticmethod\n    def install_framework():\n        cmd = [\"pip3\", \"install\", \"transformers<=4.28.0\"]\n        subprocess.run(cmd)\n\n        try:\n            import transformers  # noqa F401\n        except ImportError:\n            return False\n\n        return True\n\n\nclass DiffusersInstaller(BaseInstaller):\n    @staticmethod\n    def install_dependencies(include_framework: List[str]):\n        cmd = [\"pip3\", \"install\", \"transformers<=4.28.0\"]\n        subprocess.run(cmd)\n\n        if gpu_is_available():\n            cmd = [\"pip3\", \"install\", \"cuda-python\"]\n            subprocess.run(cmd)\n\n            cmd = [\"pip3\", \"install\", \"onnx>=1.10.0, <=1.14.0\"]\n            
subprocess.run(cmd)\n\n            cmd = [\n                \"pip3\",\n                \"install\",\n                \"onnx_graphsurgeon\",\n                \"--index-url\",\n                \"https://pypi.ngc.nvidia.com\",\n            ]\n            subprocess.run(cmd)\n\n    @staticmethod\n    def check_framework():\n        try:\n            import diffusers  # noqa F401\n        except ImportError:\n            return False\n\n        if not check_module_version(diffusers, min_version=\"0.13.0\"):\n            return False\n\n        return True\n\n    @staticmethod\n    def install_framework():\n        cmd = [\"pip3\", \"install\", \"diffusers>=0.13.0, <=0.15.0\"]\n        subprocess.run(cmd)\n\n        try:\n            import diffusers  # noqa F401\n        except ImportError:\n            return False\n\n        return True\n\n\nCOMPILER_INSTALLERS = {\n    \"openvino\": install_openvino,\n    \"tensor_rt\": install_tensor_rt,\n    \"torch_tensor_rt\": install_torch_tensor_rt,\n    \"deepsparse\": install_deepsparse,\n    \"intel_neural_compressor\": install_intel_neural_compressor,\n    # \"faster_transformer\": install_faster_transformer,\n}\n\nCOMPILERS_AVAILABLE = {\n    \"openvino\": openvino_is_available,\n    \"tensor_rt\": tensorrt_is_available,\n    \"torch_tensor_rt\": torch_tensorrt_is_available,\n    \"deepsparse\": deepsparse_is_available,\n    \"intel_neural_compressor\": intel_neural_compressor_is_available,\n    # \"faster_transformer\": faster_transformer_is_available,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/tests/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/tests/test_install_frameworks.py",
    "content": "from nebullvm.installers.auto_installer import (\n    select_frameworks_to_install,\n    select_compilers_to_install,\n)\n\n\ndef test_install_default_option():\n    include_frameworks = \"all\"\n    include_backends = \"all\"\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\n        \"diffusers\",\n        \"huggingface\",\n        \"onnx\",\n        \"tensorflow\",\n        \"torch\",\n    ]\n\n\ndef test_install_torch_full():\n    include_frameworks = [\"torch\"]\n    include_backends = \"all\"\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"onnx\", \"torch\"]\n\n\ndef test_install_torch_base():\n    include_frameworks = [\"torch\"]\n    include_backends = []\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"torch\"]\n\n\ndef test_install_tensorflow_full():\n    include_frameworks = [\"tensorflow\"]\n    include_backends = \"all\"\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"onnx\", \"tensorflow\"]\n\n\ndef test_install_tensorflow_base():\n    include_frameworks = [\"tensorflow\"]\n    include_backends = []\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"tensorflow\"]\n\n\ndef test_install_onnx_full():\n    include_frameworks = [\"onnx\"]\n    include_backends = \"all\"\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"onnx\"]\n\n\ndef test_install_onnx_base():\n    include_frameworks = [\"onnx\"]\n    include_backends = []\n\n    include_backends = 
select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"onnx\"]\n\n\ndef test_install_diffusers_full():\n    include_frameworks = [\"diffusers\"]\n    include_backends = \"all\"\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"diffusers\", \"onnx\", \"torch\"]\n\n\ndef test_install_huggingface_full():\n    include_frameworks = [\"huggingface\"]\n    include_backends = \"all\"\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"huggingface\", \"onnx\", \"tensorflow\", \"torch\"]\n\n\ndef test_install_huggingface_full_tf():\n    include_frameworks = [\"huggingface\"]\n    include_backends = [\"onnx\", \"tensorflow\"]\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"huggingface\", \"onnx\", \"tensorflow\"]\n\n\ndef test_install_huggingface_full_torch():\n    include_frameworks = [\"huggingface\"]\n    include_backends = [\"onnx\", \"torch\"]\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"huggingface\", \"onnx\", \"torch\"]\n\n\ndef test_install_huggingface_tf():\n    include_frameworks = [\"huggingface\"]\n    include_backends = [\"tensorflow\"]\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"huggingface\", \"tensorflow\"]\n\n\ndef test_install_huggingface_torch():\n    include_frameworks = [\"huggingface\"]\n    include_backends = [\"torch\"]\n\n    include_backends = select_frameworks_to_install(\n        include_frameworks, include_backends\n    )\n\n    assert include_backends == [\"huggingface\", \"torch\"]\n\n\ndef 
test_install_huggingface_compilers_all():\n    framework_list = [\"huggingface\"]\n    include_compilers = \"all\"\n\n    compiler_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    assert compiler_list == []\n\n\ndef test_install_huggingface_torch_compilers_all():\n    framework_list = [\"huggingface\", \"torch\"]\n    include_compilers = \"all\"\n\n    compiler_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    assert compiler_list == [\n        \"deepsparse\",\n        \"faster_transformer\",\n        \"intel_neural_compressor\",\n        \"tensor_rt\",\n        \"torch_tensor_rt\",\n    ]\n\n\ndef test_install_torch_compilers_all():\n    framework_list = [\"torch\"]\n    include_compilers = \"all\"\n\n    compiler_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    assert compiler_list == [\n        \"deepsparse\",\n        \"faster_transformer\",\n        \"intel_neural_compressor\",\n        \"tensor_rt\",\n        \"torch_tensor_rt\",\n    ]\n\n\ndef test_install_torch_compilers_deepsparse():\n    framework_list = [\"torch\"]\n    include_compilers = [\"deepsparse\"]\n\n    compiler_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    assert compiler_list == [\"deepsparse\"]\n\n\ndef test_install_torch_compilers_invalid():\n    framework_list = [\"torch\"]\n    include_compilers = [\"best_compiler\"]\n\n    compiler_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    assert compiler_list == []\n\n\ndef test_install_torch_onnx_compilers_all():\n    framework_list = [\"torch\", \"onnx\"]\n    include_compilers = \"all\"\n\n    compiler_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    assert compiler_list == [\n        \"deepsparse\",\n        \"faster_transformer\",\n        \"intel_neural_compressor\",\n        
\"openvino\",\n        \"tensor_rt\",\n        \"torch_tensor_rt\",\n    ]\n\n\ndef test_install_tensorflow_compilers_all():\n    framework_list = [\"tensorflow\"]\n    include_compilers = \"all\"\n\n    compiler_list = select_compilers_to_install(\n        include_compilers, framework_list\n    )\n\n    assert compiler_list == []\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/tvm_installers/arm/config.cmake",
    "content": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# \"License\"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\n\n#--------------------------------------------------------------------\n#  Template custom cmake configuration for compiling\n#\n#  This file is used to override the build options in build.\n#  If you want to change the configuration, please use the following\n#  steps. Assume you are on the root directory. 
First copy this\n#  file so that any local changes will be ignored by git\n#\n#  $ mkdir build\n#  $ cp cmake/config.cmake build\n#\n#  Next modify the according entries, and then compile by\n#\n#  $ cd build\n#  $ cmake ..\n#\n#  Then build in parallel with 8 threads\n#\n#  $ make -j8\n#--------------------------------------------------------------------\n\n#---------------------------------------------\n# Backend runtimes.\n#---------------------------------------------\n\n# Whether enable CUDA during compile,\n#\n# Possible values:\n# - ON: enable CUDA with cmake's auto search\n# - OFF: disable CUDA\n# - /path/to/cuda: use specific path to cuda toolkit\nset(USE_CUDA OFF)\n\n# Whether enable ROCM runtime\n#\n# Possible values:\n# - ON: enable ROCM with cmake's auto search\n# - OFF: disable ROCM\n# - /path/to/rocm: use specific path to rocm\nset(USE_ROCM OFF)\n\n# Whether enable SDAccel runtime\nset(USE_SDACCEL OFF)\n\n# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime\nset(USE_AOCL OFF)\n\n# Whether enable OpenCL runtime\n#\n# Possible values:\n# - ON: enable OpenCL with cmake's auto search\n# - OFF: disable OpenCL\n# - /path/to/opencl-sdk: use specific path to opencl-sdk\nset(USE_OPENCL OFF)\n\n# Whether enable Metal runtime\nset(USE_METAL OFF)\n\n# Whether enable Vulkan runtime\n#\n# Possible values:\n# - ON: enable Vulkan with cmake's auto search\n# - OFF: disable vulkan\n# - /path/to/vulkan-sdk: use specific path to vulkan-sdk\nset(USE_VULKAN OFF)\n\n# Whether enable OpenGL runtime\nset(USE_OPENGL OFF)\n\n# Whether enable MicroTVM runtime\nset(USE_MICRO OFF)\n\n# Whether enable RPC runtime\nset(USE_RPC ON)\n\n# Whether to build the C++ RPC server binary\nset(USE_CPP_RPC OFF)\n\n# Whether to build the iOS RPC server application\nset(USE_IOS_RPC OFF)\n\n# Whether embed stackvm into the runtime\nset(USE_STACKVM_RUNTIME OFF)\n\n# Whether enable tiny embedded graph executor.\nset(USE_GRAPH_EXECUTOR ON)\n\n# Whether enable tiny graph executor with CUDA 
Graph\nset(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)\n\n# Whether enable pipeline executor.\nset(USE_PIPELINE_EXECUTOR OFF)\n\n# Whether to enable the profiler for the graph executor and vm\nset(USE_PROFILER ON)\n\n# Whether enable microTVM standalone runtime\nset(USE_MICRO_STANDALONE_RUNTIME OFF)\n\n# Whether build with LLVM support\n# Requires LLVM version >= 4.0\n#\n# Possible values:\n# - ON: enable llvm with cmake's find search\n# - OFF: disable llvm, note this will disable CPU codegen\n#        which is needed for most cases\n# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.\nset(USE_LLVM ON)\n\n#---------------------------------------------\n# Contrib libraries\n#---------------------------------------------\n# Whether to build with BYODT software emulated posit custom datatype\n#\n# Possible values:\n# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH\n# - OFF: disable BYODT posit\n#\n# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON\nset(USE_BYODT_POSIT OFF)\n\n# Whether use BLAS, choices: openblas, atlas, apple\nset(USE_BLAS none)\n\n# Whether to use MKL\n# Possible values:\n# - ON: Enable MKL\n# - /path/to/mkl: mkl root path\n# - OFF: Disable MKL\n# set(USE_MKL /opt/intel/mkl) for UNIX\n# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32\n# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`\nset(USE_MKL OFF)\n\n# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library\nset(USE_MKLDNN OFF)\n\n# Whether use OpenMP thread pool, choices: gnu, intel\n# Note: \"gnu\" uses gomp library, \"intel\" uses iomp5 library\nset(USE_OPENMP none)\n\n# Whether use contrib.random in runtime\nset(USE_RANDOM ON)\n\n# Whether use NNPack\nset(USE_NNPACK OFF)\n\n# Possible values:\n# - ON: enable tflite with cmake's find search\n# - OFF: disable tflite\n# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library\nset(USE_TFLITE OFF)\n\n# 
/path/to/tensorflow: tensorflow root path when use tflite library\nset(USE_TENSORFLOW_PATH none)\n\n# Required for full builds with TFLite. Not needed for runtime with TFLite.\n# /path/to/flatbuffers: flatbuffers root path when using tflite library\nset(USE_FLATBUFFERS_PATH none)\n\n# Possible values:\n# - OFF: disable tflite support for edgetpu\n# - /path/to/edgetpu: use specific path to edgetpu library\nset(USE_EDGETPU OFF)\n\n# Possible values:\n# - ON: enable cuDNN with cmake's auto search in CUDA directory\n# - OFF: disable cuDNN\n# - /path/to/cudnn: use specific path to cuDNN path\nset(USE_CUDNN OFF)\n\n# Whether use cuBLAS\nset(USE_CUBLAS OFF)\n\n# Whether use MIOpen\nset(USE_MIOPEN OFF)\n\n# Whether use MPS\nset(USE_MPS OFF)\n\n# Whether use rocBlas\nset(USE_ROCBLAS OFF)\n\n# Whether use contrib sort\nset(USE_SORT ON)\n\n# Whether use MKL-DNN (DNNL) codegen\nset(USE_DNNL_CODEGEN OFF)\n\n# Whether to use Arm Compute Library (ACL) codegen\n# We provide 2 separate flags since we cannot build the ACL runtime on x86.\n# This is useful for cases where you want to cross-compile a relay graph\n# on x86 then run on AArch.\n#\n# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.\n#\n# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported\n#                       operators to Arm Compute Library. OFF/ON\n# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL\n#                                     runtime. 
OFF/ON/\"path/to/ACL\"\nset(USE_ARM_COMPUTE_LIB OFF)\nset(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)\n\n# Whether to build with Arm Ethos-N support\n# Possible values:\n# - OFF: disable Arm Ethos-N support\n# - path/to/arm-ethos-N-stack: use a specific version of the\n#   Ethos-N driver stack\nset(USE_ETHOSN OFF)\n# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine\n# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure\nset(USE_ETHOSN_HW OFF)\n\n# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support\nset(USE_ETHOSU OFF)\n\n# Whether to build with TensorRT codegen or runtime\n# Examples are available here: docs/deploy/tensorrt.rst.\n#\n# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are\n#                        offloaded to TensorRT. OFF/ON\n# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of\n#                        TensorRT library. OFF/ON/\"path/to/TensorRT\"\nset(USE_TENSORRT_CODEGEN OFF)\nset(USE_TENSORRT_RUNTIME OFF)\n\n# Whether use VITIS-AI codegen\nset(USE_VITIS_AI OFF)\n\n# Build Verilator codegen and runtime\nset(USE_VERILATOR OFF)\n\n# Build ANTLR parser for Relay text format\n# Possible values:\n# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)\n# - OFF: disable ANTLR\n# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file\nset(USE_ANTLR OFF)\n\n# Whether use Relay debug mode\nset(USE_RELAY_DEBUG OFF)\n\n# Whether to build fast VTA simulator driver\nset(USE_VTA_FSIM OFF)\n\n# Whether to build cycle-accurate VTA simulator driver\nset(USE_VTA_TSIM OFF)\n\n# Whether to build VTA FPGA driver (device side only)\nset(USE_VTA_FPGA OFF)\n\n# Whether use Thrust\nset(USE_THRUST OFF)\n\n# Whether to build the TensorFlow TVMDSOOp module\nset(USE_TF_TVMDSOOP OFF)\n\n# Whether to build the PyTorch custom class module\nset(USE_PT_TVMDSOOP OFF)\n\n# 
Whether to use STL's std::unordered_map or TVM's POD compatible Map\nset(USE_FALLBACK_STL_MAP OFF)\n\n# Whether to use hexagon device\nset(USE_HEXAGON_DEVICE OFF)\nset(USE_HEXAGON_SDK /path/to/sdk)\n\n# Whether to build the hexagon launcher\nset(USE_HEXAGON_LAUNCHER OFF)\n\n# Hexagon architecture to target when compiling TVM itself (not the target for\n# compiling _by_ TVM). This applies to components like the TVM runtime, but is\n# also used to select correct include/library paths from the Hexagon SDK when\n# building offloading runtime for Android.\n# Valid values are v60, v62, v65, v66, v68.\nset(USE_HEXAGON_ARCH \"v66\")\n\n# Whether to use ONNX codegen\nset(USE_TARGET_ONNX OFF)\n\n# Whether enable BNNS runtime\nset(USE_BNNS OFF)\n\n# Whether to use libbacktrace\n# Libbacktrace provides line and column information on stack traces from errors.\n# It is only supported on linux and macOS.\n# Possible values:\n# - AUTO: auto set according to system information and feasibility\n# - ON: enable libbacktrace\n# - OFF: disable libbacktrace\nset(USE_LIBBACKTRACE AUTO)\n\n# Whether to build static libtvm_runtime.a, the default is to build the dynamic\n# version: libtvm_runtime.so.\n#\n# The static runtime library needs to be linked into executables with the linker\n# option --whole-archive (or its equivalent). The reason is that the TVM registry\n# mechanism relies on global constructors being executed at program startup.\n# Global constructors alone are not sufficient for the linker to consider a\n# library member to be used, and some of such library members (object files) may\n# not be included in the final executable. 
This would make the corresponding\n# runtime functions unavailable to the program.\nset(BUILD_STATIC_RUNTIME OFF)\n\n\n# Caches the build so that building is faster when switching between branches.\n# If you switch branches, build and then encounter a linking error, you may\n# need to regenerate the build tree through \"cmake ..\" (the cache will\n# still provide significant speedups).\n# Possible values:\n# - AUTO: search for path to ccache, disable if not found.\n# - ON: enable ccache by searching for the path to ccache, report an error if not found\n# - OFF: disable ccache\n# - /path/to/ccache: use specific path to ccache\nset(USE_CCACHE AUTO)\n\n# Whether to enable PAPI support in profiling. PAPI provides access to hardware\n# counters while profiling.\n# Possible values:\n# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc\n# - OFF: disable PAPI support.\n# - /path/to/folder/containing/: Path to folder containing papi.pc.\nset(USE_PAPI OFF)\n\n# Whether to use GoogleTest for C++ unit tests. When enabled, the generated\n# build file (e.g. Makefile) will have a target \"cpptest\".\n# Possible values:\n# - ON: enable GoogleTest. The package `GTest` will be required for cmake\n#   to succeed.\n# - OFF: disable GoogleTest.\n# - AUTO: cmake will attempt to find the GTest package, if found GTest will\n#   be enabled, otherwise it will be disabled.\n# Note that cmake will use `find_package` to find GTest. Please use cmake's\n# predefined variables to specify the path to the GTest package if needed.\nset(USE_GTEST AUTO)\n\n# Enable using CUTLASS as a BYOC backend\n# Need to have USE_CUDA=ON\nset(USE_CUTLASS OFF)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/tvm_installers/arm_cuda/config.cmake",
    "content": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# \"License\"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\n\n#--------------------------------------------------------------------\n#  Template custom cmake configuration for compiling\n#\n#  This file is used to override the build options in build.\n#  If you want to change the configuration, please use the following\n#  steps. Assume you are on the root directory. 
First copy this\n#  file so that any local changes will be ignored by git\n#\n#  $ mkdir build\n#  $ cp cmake/config.cmake build\n#\n#  Next modify the according entries, and then compile by\n#\n#  $ cd build\n#  $ cmake ..\n#\n#  Then build in parallel with 8 threads\n#\n#  $ make -j8\n#--------------------------------------------------------------------\n\n#---------------------------------------------\n# Backend runtimes.\n#---------------------------------------------\n\n# Whether enable CUDA during compile,\n#\n# Possible values:\n# - ON: enable CUDA with cmake's auto search\n# - OFF: disable CUDA\n# - /path/to/cuda: use specific path to cuda toolkit\nset(USE_CUDA ON)\n\n# Whether enable ROCM runtime\n#\n# Possible values:\n# - ON: enable ROCM with cmake's auto search\n# - OFF: disable ROCM\n# - /path/to/rocm: use specific path to rocm\nset(USE_ROCM OFF)\n\n# Whether enable SDAccel runtime\nset(USE_SDACCEL OFF)\n\n# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime\nset(USE_AOCL OFF)\n\n# Whether enable OpenCL runtime\n#\n# Possible values:\n# - ON: enable OpenCL with cmake's auto search\n# - OFF: disable OpenCL\n# - /path/to/opencl-sdk: use specific path to opencl-sdk\nset(USE_OPENCL OFF)\n\n# Whether enable Metal runtime\nset(USE_METAL OFF)\n\n# Whether enable Vulkan runtime\n#\n# Possible values:\n# - ON: enable Vulkan with cmake's auto search\n# - OFF: disable vulkan\n# - /path/to/vulkan-sdk: use specific path to vulkan-sdk\nset(USE_VULKAN OFF)\n\n# Whether enable OpenGL runtime\nset(USE_OPENGL OFF)\n\n# Whether enable MicroTVM runtime\nset(USE_MICRO OFF)\n\n# Whether enable RPC runtime\nset(USE_RPC ON)\n\n# Whether to build the C++ RPC server binary\nset(USE_CPP_RPC OFF)\n\n# Whether to build the iOS RPC server application\nset(USE_IOS_RPC OFF)\n\n# Whether embed stackvm into the runtime\nset(USE_STACKVM_RUNTIME OFF)\n\n# Whether enable tiny embedded graph executor.\nset(USE_GRAPH_EXECUTOR ON)\n\n# Whether enable tiny graph executor with CUDA 
Graph\nset(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)\n\n# Whether enable pipeline executor.\nset(USE_PIPELINE_EXECUTOR OFF)\n\n# Whether to enable the profiler for the graph executor and vm\nset(USE_PROFILER ON)\n\n# Whether enable microTVM standalone runtime\nset(USE_MICRO_STANDALONE_RUNTIME OFF)\n\n# Whether build with LLVM support\n# Requires LLVM version >= 4.0\n#\n# Possible values:\n# - ON: enable llvm with cmake's find search\n# - OFF: disable llvm, note this will disable CPU codegen\n#        which is needed for most cases\n# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.\nset(USE_LLVM ON)\n\n#---------------------------------------------\n# Contrib libraries\n#---------------------------------------------\n# Whether to build with BYODT software emulated posit custom datatype\n#\n# Possible values:\n# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH\n# - OFF: disable BYODT posit\n#\n# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON\nset(USE_BYODT_POSIT OFF)\n\n# Whether use BLAS, choices: openblas, atlas, apple\nset(USE_BLAS none)\n\n# Whether to use MKL\n# Possible values:\n# - ON: Enable MKL\n# - /path/to/mkl: mkl root path\n# - OFF: Disable MKL\n# set(USE_MKL /opt/intel/mkl) for UNIX\n# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32\n# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`\nset(USE_MKL OFF)\n\n# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library\nset(USE_MKLDNN OFF)\n\n# Whether use OpenMP thread pool, choices: gnu, intel\n# Note: \"gnu\" uses gomp library, \"intel\" uses iomp5 library\nset(USE_OPENMP none)\n\n# Whether use contrib.random in runtime\nset(USE_RANDOM ON)\n\n# Whether use NNPack\nset(USE_NNPACK OFF)\n\n# Possible values:\n# - ON: enable tflite with cmake's find search\n# - OFF: disable tflite\n# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library\nset(USE_TFLITE OFF)\n\n# 
/path/to/tensorflow: tensorflow root path when use tflite library\nset(USE_TENSORFLOW_PATH none)\n\n# Required for full builds with TFLite. Not needed for runtime with TFLite.\n# /path/to/flatbuffers: flatbuffers root path when using tflite library\nset(USE_FLATBUFFERS_PATH none)\n\n# Possible values:\n# - OFF: disable tflite support for edgetpu\n# - /path/to/edgetpu: use specific path to edgetpu library\nset(USE_EDGETPU OFF)\n\n# Possible values:\n# - ON: enable cuDNN with cmake's auto search in CUDA directory\n# - OFF: disable cuDNN\n# - /path/to/cudnn: use specific path to cuDNN path\nset(USE_CUDNN OFF)\n\n# Whether use cuBLAS\nset(USE_CUBLAS OFF)\n\n# Whether use MIOpen\nset(USE_MIOPEN OFF)\n\n# Whether use MPS\nset(USE_MPS OFF)\n\n# Whether use rocBlas\nset(USE_ROCBLAS OFF)\n\n# Whether use contrib sort\nset(USE_SORT ON)\n\n# Whether use MKL-DNN (DNNL) codegen\nset(USE_DNNL_CODEGEN OFF)\n\n# Whether to use Arm Compute Library (ACL) codegen\n# We provide 2 separate flags since we cannot build the ACL runtime on x86.\n# This is useful for cases where you want to cross-compile a relay graph\n# on x86 then run on AArch.\n#\n# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.\n#\n# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported\n#                       operators to Arm Compute Library. OFF/ON\n# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL\n#                                     runtime. 
OFF/ON/\"path/to/ACL\"\nset(USE_ARM_COMPUTE_LIB OFF)\nset(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)\n\n# Whether to build with Arm Ethos-N support\n# Possible values:\n# - OFF: disable Arm Ethos-N support\n# - path/to/arm-ethos-N-stack: use a specific version of the\n#   Ethos-N driver stack\nset(USE_ETHOSN OFF)\n# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine\n# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure\nset(USE_ETHOSN_HW OFF)\n\n# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support\nset(USE_ETHOSU OFF)\n\n# Whether to build with TensorRT codegen or runtime\n# Examples are available here: docs/deploy/tensorrt.rst.\n#\n# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are\n#                        offloaded to TensorRT. OFF/ON\n# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of\n#                        TensorRT library. OFF/ON/\"path/to/TensorRT\"\nset(USE_TENSORRT_CODEGEN OFF)\nset(USE_TENSORRT_RUNTIME OFF)\n\n# Whether use VITIS-AI codegen\nset(USE_VITIS_AI OFF)\n\n# Build Verilator codegen and runtime\nset(USE_VERILATOR OFF)\n\n# Build ANTLR parser for Relay text format\n# Possible values:\n# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)\n# - OFF: disable ANTLR\n# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file\nset(USE_ANTLR OFF)\n\n# Whether use Relay debug mode\nset(USE_RELAY_DEBUG OFF)\n\n# Whether to build fast VTA simulator driver\nset(USE_VTA_FSIM OFF)\n\n# Whether to build cycle-accurate VTA simulator driver\nset(USE_VTA_TSIM OFF)\n\n# Whether to build VTA FPGA driver (device side only)\nset(USE_VTA_FPGA OFF)\n\n# Whether use Thrust\nset(USE_THRUST OFF)\n\n# Whether to build the TensorFlow TVMDSOOp module\nset(USE_TF_TVMDSOOP OFF)\n\n# Whether to build the PyTorch custom class module\nset(USE_PT_TVMDSOOP OFF)\n\n# 
Whether to use STL's std::unordered_map or TVM's POD compatible Map\nset(USE_FALLBACK_STL_MAP OFF)\n\n# Whether to use hexagon device\nset(USE_HEXAGON_DEVICE OFF)\nset(USE_HEXAGON_SDK /path/to/sdk)\n\n# Whether to build the hexagon launcher\nset(USE_HEXAGON_LAUNCHER OFF)\n\n# Hexagon architecture to target when compiling TVM itself (not the target for\n# compiling _by_ TVM). This applies to components like the TVM runtime, but is\n# also used to select correct include/library paths from the Hexagon SDK when\n# building offloading runtime for Android.\n# Valid values are v60, v62, v65, v66, v68.\nset(USE_HEXAGON_ARCH \"v66\")\n\n# Whether to use ONNX codegen\nset(USE_TARGET_ONNX OFF)\n\n# Whether enable BNNS runtime\nset(USE_BNNS OFF)\n\n# Whether to use libbacktrace\n# Libbacktrace provides line and column information on stack traces from errors.\n# It is only supported on linux and macOS.\n# Possible values:\n# - AUTO: auto set according to system information and feasibility\n# - ON: enable libbacktrace\n# - OFF: disable libbacktrace\nset(USE_LIBBACKTRACE AUTO)\n\n# Whether to build static libtvm_runtime.a, the default is to build the dynamic\n# version: libtvm_runtime.so.\n#\n# The static runtime library needs to be linked into executables with the linker\n# option --whole-archive (or its equivalent). The reason is that the TVM registry\n# mechanism relies on global constructors being executed at program startup.\n# Global constructors alone are not sufficient for the linker to consider a\n# library member to be used, and some of such library members (object files) may\n# not be included in the final executable. 
This would make the corresponding\n# runtime functions unavailable to the program.\nset(BUILD_STATIC_RUNTIME OFF)\n\n\n# Caches the build so that building is faster when switching between branches.\n# If you switch branches, build and then encounter a linking error, you may\n# need to regenerate the build tree through \"cmake ..\" (the cache will\n# still provide significant speedups).\n# Possible values:\n# - AUTO: search for path to ccache, disable if not found.\n# - ON: enable ccache by searching for the path to ccache, report an error if not found\n# - OFF: disable ccache\n# - /path/to/ccache: use specific path to ccache\nset(USE_CCACHE AUTO)\n\n# Whether to enable PAPI support in profiling. PAPI provides access to hardware\n# counters while profiling.\n# Possible values:\n# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc\n# - OFF: disable PAPI support.\n# - /path/to/folder/containing/: Path to folder containing papi.pc.\nset(USE_PAPI OFF)\n\n# Whether to use GoogleTest for C++ unit tests. When enabled, the generated\n# build file (e.g. Makefile) will have a target \"cpptest\".\n# Possible values:\n# - ON: enable GoogleTest. The package `GTest` will be required for cmake\n#   to succeed.\n# - OFF: disable GoogleTest.\n# - AUTO: cmake will attempt to find the GTest package, if found GTest will\n#   be enabled, otherwise it will be disabled.\n# Note that cmake will use `find_package` to find GTest. Please use cmake's\n# predefined variables to specify the path to the GTest package if needed.\nset(USE_GTEST AUTO)\n\n# Enable using CUTLASS as a BYOC backend\n# Need to have USE_CUDA=ON\nset(USE_CUTLASS OFF)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/tvm_installers/x86/config.cmake",
    "content": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# \"License\"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\n\n#--------------------------------------------------------------------\n#  Template custom cmake configuration for compiling\n#\n#  This file is used to override the build options in build.\n#  If you want to change the configuration, please use the following\n#  steps. Assume you are on the root directory. 
First copy this\n#  file so that any local changes will be ignored by git\n#\n#  $ mkdir build\n#  $ cp cmake/config.cmake build\n#\n#  Next modify the according entries, and then compile by\n#\n#  $ cd build\n#  $ cmake ..\n#\n#  Then build in parallel with 8 threads\n#\n#  $ make -j8\n#--------------------------------------------------------------------\n\n#---------------------------------------------\n# Backend runtimes.\n#---------------------------------------------\n\n# Whether enable CUDA during compile,\n#\n# Possible values:\n# - ON: enable CUDA with cmake's auto search\n# - OFF: disable CUDA\n# - /path/to/cuda: use specific path to cuda toolkit\nset(USE_CUDA OFF)\n\n# Whether enable ROCM runtime\n#\n# Possible values:\n# - ON: enable ROCM with cmake's auto search\n# - OFF: disable ROCM\n# - /path/to/rocm: use specific path to rocm\nset(USE_ROCM OFF)\n\n# Whether enable SDAccel runtime\nset(USE_SDACCEL OFF)\n\n# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime\nset(USE_AOCL OFF)\n\n# Whether enable OpenCL runtime\n#\n# Possible values:\n# - ON: enable OpenCL with cmake's auto search\n# - OFF: disable OpenCL\n# - /path/to/opencl-sdk: use specific path to opencl-sdk\nset(USE_OPENCL OFF)\n\n# Whether enable Metal runtime\nset(USE_METAL OFF)\n\n# Whether enable Vulkan runtime\n#\n# Possible values:\n# - ON: enable Vulkan with cmake's auto search\n# - OFF: disable vulkan\n# - /path/to/vulkan-sdk: use specific path to vulkan-sdk\nset(USE_VULKAN OFF)\n\n# Whether enable OpenGL runtime\nset(USE_OPENGL OFF)\n\n# Whether enable MicroTVM runtime\nset(USE_MICRO OFF)\n\n# Whether enable RPC runtime\nset(USE_RPC ON)\n\n# Whether to build the C++ RPC server binary\nset(USE_CPP_RPC OFF)\n\n# Whether to build the iOS RPC server application\nset(USE_IOS_RPC OFF)\n\n# Whether embed stackvm into the runtime\nset(USE_STACKVM_RUNTIME OFF)\n\n# Whether enable tiny embedded graph executor.\nset(USE_GRAPH_EXECUTOR ON)\n\n# Whether enable tiny graph executor with CUDA 
Graph\nset(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)\n\n# Whether enable pipeline executor.\nset(USE_PIPELINE_EXECUTOR OFF)\n\n# Whether to enable the profiler for the graph executor and vm\nset(USE_PROFILER ON)\n\n# Whether enable microTVM standalone runtime\nset(USE_MICRO_STANDALONE_RUNTIME OFF)\n\n# Whether build with LLVM support\n# Requires LLVM version >= 4.0\n#\n# Possible values:\n# - ON: enable llvm with cmake's find search\n# - OFF: disable llvm, note this will disable CPU codegen\n#        which is needed for most cases\n# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.\nset(USE_LLVM ON)\n\n#---------------------------------------------\n# Contrib libraries\n#---------------------------------------------\n# Whether to build with BYODT software emulated posit custom datatype\n#\n# Possible values:\n# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH\n# - OFF: disable BYODT posit\n#\n# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON\nset(USE_BYODT_POSIT OFF)\n\n# Whether use BLAS, choices: openblas, atlas, apple\nset(USE_BLAS none)\n\n# Whether to use MKL\n# Possible values:\n# - ON: Enable MKL\n# - /path/to/mkl: mkl root path\n# - OFF: Disable MKL\n# set(USE_MKL /opt/intel/mkl) for UNIX\n# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32\n# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`\nset(USE_MKL OFF)\n\n# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library\nset(USE_MKLDNN OFF)\n\n# Whether use OpenMP thread pool, choices: gnu, intel\n# Note: \"gnu\" uses gomp library, \"intel\" uses iomp5 library\nset(USE_OPENMP none)\n\n# Whether use contrib.random in runtime\nset(USE_RANDOM ON)\n\n# Whether use NNPack\nset(USE_NNPACK OFF)\n\n# Possible values:\n# - ON: enable tflite with cmake's find search\n# - OFF: disable tflite\n# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library\nset(USE_TFLITE OFF)\n\n# 
/path/to/tensorflow: tensorflow root path when use tflite library\nset(USE_TENSORFLOW_PATH none)\n\n# Required for full builds with TFLite. Not needed for runtime with TFLite.\n# /path/to/flatbuffers: flatbuffers root path when using tflite library\nset(USE_FLATBUFFERS_PATH none)\n\n# Possible values:\n# - OFF: disable tflite support for edgetpu\n# - /path/to/edgetpu: use specific path to edgetpu library\nset(USE_EDGETPU OFF)\n\n# Possible values:\n# - ON: enable cuDNN with cmake's auto search in CUDA directory\n# - OFF: disable cuDNN\n# - /path/to/cudnn: use specific path to cuDNN path\nset(USE_CUDNN OFF)\n\n# Whether use cuBLAS\nset(USE_CUBLAS OFF)\n\n# Whether use MIOpen\nset(USE_MIOPEN OFF)\n\n# Whether use MPS\nset(USE_MPS OFF)\n\n# Whether use rocBlas\nset(USE_ROCBLAS OFF)\n\n# Whether use contrib sort\nset(USE_SORT ON)\n\n# Whether use MKL-DNN (DNNL) codegen\nset(USE_DNNL_CODEGEN OFF)\n\n# Whether to use Arm Compute Library (ACL) codegen\n# We provide 2 separate flags since we cannot build the ACL runtime on x86.\n# This is useful for cases where you want to cross-compile a relay graph\n# on x86 then run on AArch.\n#\n# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.\n#\n# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported\n#                       operators to Arm Compute Library. OFF/ON\n# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL\n#                                     runtime. 
OFF/ON/\"path/to/ACL\"\nset(USE_ARM_COMPUTE_LIB OFF)\nset(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)\n\n# Whether to build with Arm Ethos-N support\n# Possible values:\n# - OFF: disable Arm Ethos-N support\n# - path/to/arm-ethos-N-stack: use a specific version of the\n#   Ethos-N driver stack\nset(USE_ETHOSN OFF)\n# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine\n# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure\nset(USE_ETHOSN_HW OFF)\n\n# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support\nset(USE_ETHOSU OFF)\n\n# Whether to build with TensorRT codegen or runtime\n# Examples are available here: docs/deploy/tensorrt.rst.\n#\n# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are\n#                        offloaded to TensorRT. OFF/ON\n# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of\n#                        TensorRT library. OFF/ON/\"path/to/TensorRT\"\nset(USE_TENSORRT_CODEGEN OFF)\nset(USE_TENSORRT_RUNTIME OFF)\n\n# Whether use VITIS-AI codegen\nset(USE_VITIS_AI OFF)\n\n# Build Verilator codegen and runtime\nset(USE_VERILATOR OFF)\n\n# Build ANTLR parser for Relay text format\n# Possible values:\n# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)\n# - OFF: disable ANTLR\n# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file\nset(USE_ANTLR OFF)\n\n# Whether use Relay debug mode\nset(USE_RELAY_DEBUG OFF)\n\n# Whether to build fast VTA simulator driver\nset(USE_VTA_FSIM OFF)\n\n# Whether to build cycle-accurate VTA simulator driver\nset(USE_VTA_TSIM OFF)\n\n# Whether to build VTA FPGA driver (device side only)\nset(USE_VTA_FPGA OFF)\n\n# Whether use Thrust\nset(USE_THRUST OFF)\n\n# Whether to build the TensorFlow TVMDSOOp module\nset(USE_TF_TVMDSOOP OFF)\n\n# Whether to build the PyTorch custom class module\nset(USE_PT_TVMDSOOP OFF)\n\n# 
Whether to use STL's std::unordered_map or TVM's POD compatible Map\nset(USE_FALLBACK_STL_MAP OFF)\n\n# Whether to use hexagon device\nset(USE_HEXAGON_DEVICE OFF)\nset(USE_HEXAGON_SDK /path/to/sdk)\n\n# Whether to build the hexagon launcher\nset(USE_HEXAGON_LAUNCHER OFF)\n\n# Hexagon architecture to target when compiling TVM itself (not the target for\n# compiling _by_ TVM). This applies to components like the TVM runtime, but is\n# also used to select correct include/library paths from the Hexagon SDK when\n# building offloading runtime for Android.\n# Valid values are v60, v62, v65, v66, v68.\nset(USE_HEXAGON_ARCH \"v66\")\n\n# Whether to use ONNX codegen\nset(USE_TARGET_ONNX OFF)\n\n# Whether enable BNNS runtime\nset(USE_BNNS OFF)\n\n# Whether to use libbacktrace\n# Libbacktrace provides line and column information on stack traces from errors.\n# It is only supported on linux and macOS.\n# Possible values:\n# - AUTO: auto set according to system information and feasibility\n# - ON: enable libbacktrace\n# - OFF: disable libbacktrace\nset(USE_LIBBACKTRACE AUTO)\n\n# Whether to build static libtvm_runtime.a, the default is to build the dynamic\n# version: libtvm_runtime.so.\n#\n# The static runtime library needs to be linked into executables with the linker\n# option --whole-archive (or its equivalent). The reason is that the TVM registry\n# mechanism relies on global constructors being executed at program startup.\n# Global constructors alone are not sufficient for the linker to consider a\n# library member to be used, and some of such library members (object files) may\n# not be included in the final executable. 
This would make the corresponding\n# runtime functions to be unavailable to the program.\nset(BUILD_STATIC_RUNTIME OFF)\n\n\n# Caches the build so that building is faster when switching between branches.\n# If you switch branches, build and then encounter a linking error, you may\n# need to regenerate the build tree through \"cmake ..\" (the cache will\n# still provide significant speedups).\n# Possible values:\n# - AUTO: search for path to ccache, disable if not found.\n# - ON: enable ccache by searching for the path to ccache, report an error if not found\n# - OFF: disable ccache\n# - /path/to/ccache: use specific path to ccache\nset(USE_CCACHE AUTO)\n\n# Whether to enable PAPI support in profiling. PAPI provides access to hardware\n# counters while profiling.\n# Possible values:\n# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc\n# - OFF: disable PAPI support.\n# - /path/to/folder/containing/: Path to folder containing papi.pc.\nset(USE_PAPI OFF)\n\n# Whether to use GoogleTest for C++ unit tests. When enabled, the generated\n# build file (e.g. Makefile) will have a target \"cpptest\".\n# Possible values:\n# - ON: enable GoogleTest. The package `GTest` will be required for cmake\n#   to succeed.\n# - OFF: disable GoogleTest.\n# - AUTO: cmake will attempt to find the GTest package, if found GTest will\n#   be enabled, otherwise it will be disabled.\n# Note that cmake will use `find_package` to find GTest. Please use cmake's\n# predefined variables to specify the path to the GTest package if needed.\nset(USE_GTEST AUTO)\n\n# Enable using CUTLASS as a BYOC backend\n# Need to have USE_CUDA=ON\nset(USE_CUTLASS OFF)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/installers/tvm_installers/x86_cuda/config.cmake",
    "content": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# \"License\"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\n\n#--------------------------------------------------------------------\n#  Template custom cmake configuration for compiling\n#\n#  This file is used to override the build options in build.\n#  If you want to change the configuration, please use the following\n#  steps. Assume you are on the root directory. 
First copy this\n#  file so that any local changes will be ignored by git\n#\n#  $ mkdir build\n#  $ cp cmake/config.cmake build\n#\n#  Next modify the according entries, and then compile by\n#\n#  $ cd build\n#  $ cmake ..\n#\n#  Then build in parallel with 8 threads\n#\n#  $ make -j8\n#--------------------------------------------------------------------\n\n#---------------------------------------------\n# Backend runtimes.\n#---------------------------------------------\n\n# Whether enable CUDA during compile,\n#\n# Possible values:\n# - ON: enable CUDA with cmake's auto search\n# - OFF: disable CUDA\n# - /path/to/cuda: use specific path to cuda toolkit\nset(USE_CUDA ON)\n\n# Whether enable ROCM runtime\n#\n# Possible values:\n# - ON: enable ROCM with cmake's auto search\n# - OFF: disable ROCM\n# - /path/to/rocm: use specific path to rocm\nset(USE_ROCM OFF)\n\n# Whether enable SDAccel runtime\nset(USE_SDACCEL OFF)\n\n# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime\nset(USE_AOCL OFF)\n\n# Whether enable OpenCL runtime\n#\n# Possible values:\n# - ON: enable OpenCL with cmake's auto search\n# - OFF: disable OpenCL\n# - /path/to/opencl-sdk: use specific path to opencl-sdk\nset(USE_OPENCL OFF)\n\n# Whether enable Metal runtime\nset(USE_METAL OFF)\n\n# Whether enable Vulkan runtime\n#\n# Possible values:\n# - ON: enable Vulkan with cmake's auto search\n# - OFF: disable vulkan\n# - /path/to/vulkan-sdk: use specific path to vulkan-sdk\nset(USE_VULKAN OFF)\n\n# Whether enable OpenGL runtime\nset(USE_OPENGL OFF)\n\n# Whether enable MicroTVM runtime\nset(USE_MICRO OFF)\n\n# Whether enable RPC runtime\nset(USE_RPC ON)\n\n# Whether to build the C++ RPC server binary\nset(USE_CPP_RPC OFF)\n\n# Whether to build the iOS RPC server application\nset(USE_IOS_RPC OFF)\n\n# Whether embed stackvm into the runtime\nset(USE_STACKVM_RUNTIME OFF)\n\n# Whether enable tiny embedded graph executor.\nset(USE_GRAPH_EXECUTOR ON)\n\n# Whether enable tiny graph executor with CUDA 
Graph\nset(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)\n\n# Whether enable pipeline executor.\nset(USE_PIPELINE_EXECUTOR OFF)\n\n# Whether to enable the profiler for the graph executor and vm\nset(USE_PROFILER ON)\n\n# Whether enable microTVM standalone runtime\nset(USE_MICRO_STANDALONE_RUNTIME OFF)\n\n# Whether build with LLVM support\n# Requires LLVM version >= 4.0\n#\n# Possible values:\n# - ON: enable llvm with cmake's find search\n# - OFF: disable llvm, note this will disable CPU codegen\n#        which is needed for most cases\n# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.\nset(USE_LLVM ON)\n\n#---------------------------------------------\n# Contrib libraries\n#---------------------------------------------\n# Whether to build with BYODT software emulated posit custom datatype\n#\n# Possible values:\n# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH\n# - OFF: disable BYODT posit\n#\n# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON\nset(USE_BYODT_POSIT OFF)\n\n# Whether use BLAS, choices: openblas, atlas, apple\nset(USE_BLAS none)\n\n# Whether to use MKL\n# Possible values:\n# - ON: Enable MKL\n# - /path/to/mkl: mkl root path\n# - OFF: Disable MKL\n# set(USE_MKL /opt/intel/mkl) for UNIX\n# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32\n# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`\nset(USE_MKL OFF)\n\n# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library\nset(USE_MKLDNN OFF)\n\n# Whether use OpenMP thread pool, choices: gnu, intel\n# Note: \"gnu\" uses gomp library, \"intel\" uses iomp5 library\nset(USE_OPENMP none)\n\n# Whether use contrib.random in runtime\nset(USE_RANDOM ON)\n\n# Whether use NNPack\nset(USE_NNPACK OFF)\n\n# Possible values:\n# - ON: enable tflite with cmake's find search\n# - OFF: disable tflite\n# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library\nset(USE_TFLITE OFF)\n\n# 
/path/to/tensorflow: tensorflow root path when use tflite library\nset(USE_TENSORFLOW_PATH none)\n\n# Required for full builds with TFLite. Not needed for runtime with TFLite.\n# /path/to/flatbuffers: flatbuffers root path when using tflite library\nset(USE_FLATBUFFERS_PATH none)\n\n# Possible values:\n# - OFF: disable tflite support for edgetpu\n# - /path/to/edgetpu: use specific path to edgetpu library\nset(USE_EDGETPU OFF)\n\n# Possible values:\n# - ON: enable cuDNN with cmake's auto search in CUDA directory\n# - OFF: disable cuDNN\n# - /path/to/cudnn: use specific path to cuDNN path\nset(USE_CUDNN OFF)\n\n# Whether use cuBLAS\nset(USE_CUBLAS OFF)\n\n# Whether use MIOpen\nset(USE_MIOPEN OFF)\n\n# Whether use MPS\nset(USE_MPS OFF)\n\n# Whether use rocBlas\nset(USE_ROCBLAS OFF)\n\n# Whether use contrib sort\nset(USE_SORT ON)\n\n# Whether use MKL-DNN (DNNL) codegen\nset(USE_DNNL_CODEGEN OFF)\n\n# Whether to use Arm Compute Library (ACL) codegen\n# We provide 2 separate flags since we cannot build the ACL runtime on x86.\n# This is useful for cases where you want to cross-compile a relay graph\n# on x86 then run on AArch.\n#\n# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.\n#\n# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported\n#                       operators to Arm Compute Library. OFF/ON\n# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL\n#                                     runtime. 
OFF/ON/\"path/to/ACL\"\nset(USE_ARM_COMPUTE_LIB OFF)\nset(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)\n\n# Whether to build with Arm Ethos-N support\n# Possible values:\n# - OFF: disable Arm Ethos-N support\n# - path/to/arm-ethos-N-stack: use a specific version of the\n#   Ethos-N driver stack\nset(USE_ETHOSN OFF)\n# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine\n# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure\nset(USE_ETHOSN_HW OFF)\n\n# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support\nset(USE_ETHOSU OFF)\n\n# Whether to build with TensorRT codegen or runtime\n# Examples are available here: docs/deploy/tensorrt.rst.\n#\n# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are\n#                        offloaded to TensorRT. OFF/ON\n# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of\n#                        TensorRT library. OFF/ON/\"path/to/TensorRT\"\nset(USE_TENSORRT_CODEGEN OFF)\nset(USE_TENSORRT_RUNTIME OFF)\n\n# Whether use VITIS-AI codegen\nset(USE_VITIS_AI OFF)\n\n# Build Verilator codegen and runtime\nset(USE_VERILATOR OFF)\n\n# Build ANTLR parser for Relay text format\n# Possible values:\n# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)\n# - OFF: disable ANTLR\n# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file\nset(USE_ANTLR OFF)\n\n# Whether use Relay debug mode\nset(USE_RELAY_DEBUG OFF)\n\n# Whether to build fast VTA simulator driver\nset(USE_VTA_FSIM OFF)\n\n# Whether to build cycle-accurate VTA simulator driver\nset(USE_VTA_TSIM OFF)\n\n# Whether to build VTA FPGA driver (device side only)\nset(USE_VTA_FPGA OFF)\n\n# Whether use Thrust\nset(USE_THRUST OFF)\n\n# Whether to build the TensorFlow TVMDSOOp module\nset(USE_TF_TVMDSOOP OFF)\n\n# Whether to build the PyTorch custom class module\nset(USE_PT_TVMDSOOP OFF)\n\n# 
Whether to use STL's std::unordered_map or TVM's POD compatible Map\nset(USE_FALLBACK_STL_MAP OFF)\n\n# Whether to use hexagon device\nset(USE_HEXAGON_DEVICE OFF)\nset(USE_HEXAGON_SDK /path/to/sdk)\n\n# Whether to build the hexagon launcher\nset(USE_HEXAGON_LAUNCHER OFF)\n\n# Hexagon architecture to target when compiling TVM itself (not the target for\n# compiling _by_ TVM). This applies to components like the TVM runtime, but is\n# also used to select correct include/library paths from the Hexagon SDK when\n# building offloading runtime for Android.\n# Valid values are v60, v62, v65, v66, v68.\nset(USE_HEXAGON_ARCH \"v66\")\n\n# Whether to use ONNX codegen\nset(USE_TARGET_ONNX OFF)\n\n# Whether enable BNNS runtime\nset(USE_BNNS OFF)\n\n# Whether to use libbacktrace\n# Libbacktrace provides line and column information on stack traces from errors.\n# It is only supported on linux and macOS.\n# Possible values:\n# - AUTO: auto set according to system information and feasibility\n# - ON: enable libbacktrace\n# - OFF: disable libbacktrace\nset(USE_LIBBACKTRACE AUTO)\n\n# Whether to build static libtvm_runtime.a, the default is to build the dynamic\n# version: libtvm_runtime.so.\n#\n# The static runtime library needs to be linked into executables with the linker\n# option --whole-archive (or its equivalent). The reason is that the TVM registry\n# mechanism relies on global constructors being executed at program startup.\n# Global constructors alone are not sufficient for the linker to consider a\n# library member to be used, and some of such library members (object files) may\n# not be included in the final executable. 
This would make the corresponding\n# runtime functions to be unavailable to the program.\nset(BUILD_STATIC_RUNTIME OFF)\n\n\n# Caches the build so that building is faster when switching between branches.\n# If you switch branches, build and then encounter a linking error, you may\n# need to regenerate the build tree through \"cmake ..\" (the cache will\n# still provide significant speedups).\n# Possible values:\n# - AUTO: search for path to ccache, disable if not found.\n# - ON: enable ccache by searching for the path to ccache, report an error if not found\n# - OFF: disable ccache\n# - /path/to/ccache: use specific path to ccache\nset(USE_CCACHE AUTO)\n\n# Whether to enable PAPI support in profiling. PAPI provides access to hardware\n# counters while profiling.\n# Possible values:\n# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc\n# - OFF: disable PAPI support.\n# - /path/to/folder/containing/: Path to folder containing papi.pc.\nset(USE_PAPI OFF)\n\n# Whether to use GoogleTest for C++ unit tests. When enabled, the generated\n# build file (e.g. Makefile) will have a target \"cpptest\".\n# Possible values:\n# - ON: enable GoogleTest. The package `GTest` will be required for cmake\n#   to succeed.\n# - OFF: disable GoogleTest.\n# - AUTO: cmake will attempt to find the GTest package, if found GTest will\n#   be enabled, otherwise it will be disabled.\n# Note that cmake will use `find_package` to find GTest. Please use cmake's\n# predefined variables to specify the path to the GTest package if needed.\nset(USE_GTEST AUTO)\n\n# Enable using CUTLASS as a BYOC backend\n# Need to have USE_CUDA=ON\nset(USE_CUTLASS OFF)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/base.py",
    "content": "import abc\nfrom typing import Dict, Union\n\nfrom loguru import logger\n\nfrom nebullvm.core.models import Device, DeviceType\nfrom nebullvm.tools.feedback_collector import FeedbackCollector\nfrom nebullvm.tools.utils import check_device\n\n\nclass Operation(abc.ABC):\n    def __init__(self):\n        self._state = {}\n        self.device = Device(DeviceType.CPU)\n        self.execute_count = 0\n        self.logger = logger\n        self.feedback_collector = None\n\n    def set_feedback_collector(self, feedback_collector: FeedbackCollector):\n        self.feedback_collector = feedback_collector\n        for value in self.__dict__.values():\n            if isinstance(value, Operation):\n                value.set_feedback_collector(feedback_collector)\n\n    @abc.abstractmethod\n    def execute(self, **kwargs):\n        raise NotImplementedError()\n\n    @property\n    def state(self) -> Dict[str, any]:\n        return self._state\n\n    def to(self, device: Union[str, Device]):\n        if isinstance(device, str):\n            self.device = check_device(device)\n        else:\n            self.device = device\n        return self\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/conversions/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/conversions/converters.py",
    "content": "import abc\nfrom pathlib import Path\nfrom typing import Optional, List, Union\n\nfrom nebullvm.core.models import DeviceType, DeepLearningFramework, ModelParams\nfrom nebullvm.operations.base import Operation\nfrom nebullvm.operations.conversions.pytorch import convert_torch_to_onnx\nfrom nebullvm.operations.conversions.tensorflow import convert_tf_to_onnx\nfrom nebullvm.optional_modules.onnx import onnx\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.data import DataManager\n\n\nclass Converter(Operation, abc.ABC):\n    ONNX_EXTENSION = \".onnx\"\n    TORCH_EXTENSION = \".pt\"\n    TF_EXTENSION = \".pb\"\n    SUPPORTED_DEVICES = [DeviceType.GPU, DeviceType.CPU]\n\n    def __init__(self, model_name: Optional[str] = None):\n        super().__init__()\n        self.model = None\n        self.data = None\n        self.converted_models = None\n        self.model_params = None\n        self.device = None\n        self.model_name = model_name or \"temp\"\n\n    def set_state(\n        self, model: Union[torch.nn.Module, tf.Module, str], data: DataManager\n    ):\n        self.model = model\n        self.data = data\n        return self\n\n    def get_result(self) -> List:\n        return [model for model in self.converted_models if model is not None]\n\n\nclass PytorchConverter(Converter):\n    DEST_FRAMEWORKS = [DeepLearningFramework.NUMPY]\n\n    def execute(\n        self,\n        save_path: Path,\n        model_params: ModelParams,\n    ):\n        self.converted_models = [self.model]\n\n        if self.device.type not in self.SUPPORTED_DEVICES:\n            return\n\n        for framework in self.DEST_FRAMEWORKS:\n            if framework is DeepLearningFramework.NUMPY:\n                self.onnx_conversion(save_path, model_params)\n            else:\n                raise NotImplementedError()\n\n    def onnx_conversion(self, save_path, model_params):\n      
  onnx_path = save_path / f\"{self.model_name}{self.ONNX_EXTENSION}\"\n        onnx_model_path = convert_torch_to_onnx(\n            torch_model=self.model,\n            input_data=self.data,\n            model_params=model_params,\n            output_file_path=onnx_path,\n            device=self.device,\n        )\n        if self.converted_models is None:\n            self.converted_models = [onnx_model_path]\n        else:\n            self.converted_models.append(onnx_model_path)\n\n    def tensorflow_conversion(self):\n        # TODO: Implement conversion from Pytorch to Tensorflow\n        raise NotImplementedError()\n\n\nclass TensorflowConverter(Converter):\n    DEST_FRAMEWORKS = [DeepLearningFramework.NUMPY]\n\n    def execute(\n        self,\n        save_path: Path,\n        model_params: ModelParams,\n    ):\n        self.converted_models = [self.model]\n\n        if self.device.type not in self.SUPPORTED_DEVICES:\n            return\n\n        for framework in self.DEST_FRAMEWORKS:\n            if framework is DeepLearningFramework.NUMPY:\n                self.onnx_conversion(save_path, model_params)\n            else:\n                raise NotImplementedError()\n\n    def onnx_conversion(self, save_path, model_params):\n        onnx_path = save_path / f\"{self.model_name}{self.ONNX_EXTENSION}\"\n        onnx_model_path = convert_tf_to_onnx(\n            model=self.model,\n            model_params=model_params,\n            output_file_path=onnx_path,\n        )\n        if self.converted_models is None:\n            self.converted_models = [onnx_model_path]\n        else:\n            self.converted_models.append(onnx_model_path)\n\n    def pytorch_conversion(self):\n        # TODO: Implement conversion from Tensorflow to Pytorch\n        raise NotImplementedError()\n\n\nclass ONNXConverter(Converter):\n    DEST_FRAMEWORKS = []\n\n    def execute(self, save_path, model_params):\n        onnx_path = save_path / 
f\"{self.model_name}{self.ONNX_EXTENSION}\"\n        try:\n            model_onnx = onnx.load(str(self.model))\n            onnx.save(model_onnx, str(onnx_path))\n        except Exception:\n            self.logger.error(\n                \"The provided onnx model path is invalid. Please provide\"\n                \" a valid path to a model in order to use Nebullvm.\"\n            )\n            self.converted_models = []\n\n        self.converted_models = [str(onnx_path)]\n\n    def tensorflow_conversion(self):\n        # TODO: Implement conversion from ONNX to Tensorflow\n        raise NotImplementedError()\n\n    def pytorch_conversion(self):\n        # TODO: Implement conversion from ONNX to Pytorch\n        raise NotImplementedError()\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/conversions/huggingface.py",
    "content": "from typing import (\n    List,\n    Dict,\n    Sequence,\n    Optional,\n)\n\nimport numpy as np\n\nfrom nebullvm.core.models import Device\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.optional_modules.huggingface import (\n    PreTrainedTokenizer,\n    PreTrainedModel,\n)\nfrom nebullvm.tools.huggingface import (\n    get_output_structure_from_dict,\n    get_output_structure_from_text,\n    PyTorchTransformerWrapper,\n    TensorFlowTransformerWrapper,\n)\nfrom nebullvm.tools.utils import is_dict_type\n\n\nclass _HFTextDataset(Sequence):\n    def __init__(\n        self,\n        input_texts: List,\n        ys: Optional[List],\n        keywords: List[str],\n        batch_size: int,\n        tokenizer: PreTrainedTokenizer,\n        tokenizer_args: Dict,\n    ):\n        self._input_texts = input_texts\n        self._ys = ys\n        self._bs = batch_size\n        self._keys = keywords\n        self._tokenizer = tokenizer\n        if self._tokenizer.pad_token is None:\n            self._tokenizer.pad_token = self._tokenizer.eos_token\n        _tokenizer_args = {\"truncation\": True, \"padding\": True}\n        _tokenizer_args.update(tokenizer_args)\n        self._tokenizer_args = _tokenizer_args\n\n    def __getitem__(self, item: int):\n        pointer = self._bs * item\n        if pointer >= len(self._input_texts):\n            raise IndexError\n        mini_batch = self._input_texts[\n            pointer : pointer + self._bs  # noqa E203\n        ]\n        if self._ys is not None:\n            mini_batch_y = self._ys[pointer : pointer + self._bs]  # noqa E203\n        else:\n            mini_batch_y = None\n        encoded_inputs = self._tokenizer(mini_batch, **self._tokenizer_args)\n        return tuple(encoded_inputs[key] for key in self._keys), mini_batch_y\n\n    def __len__(self):\n        return len(self._input_texts) // self._bs\n\n\nclass 
_HFDictDataset(Sequence):\n    def __init__(\n        self,\n        input_data: List,\n        ys: Optional[List],\n        keywords: List[str],\n    ):\n        self._input_data = input_data\n        self._ys = ys\n        self._keys = keywords\n\n    def __getitem__(self, item: int):\n        pointer = item\n        if pointer >= len(self._input_data):\n            raise IndexError\n        mini_batch = self._input_data[pointer]\n        if self._ys is not None:\n            mini_batch_y = self._ys[pointer]\n        else:\n            mini_batch_y = None\n        return (\n            tuple(self._concatenate(mini_batch, key) for key in self._keys),\n            mini_batch_y,\n        )\n\n    def __len__(self):\n        return len(self._input_data)\n\n    @staticmethod\n    def _concatenate(mini_batch, key):\n        if isinstance(mini_batch[key], torch.Tensor):\n            return torch.concat([mini_batch[key]])\n        elif isinstance(mini_batch[key], tf.Tensor):\n            return tf.concat([mini_batch[key]], 0)\n        else:\n            return np.concatenate([mini_batch[key]])\n\n\ndef convert_hf_model(\n    model: PreTrainedModel,\n    input_data: List,\n    device: Device,\n    tokenizer: Optional[PreTrainedTokenizer] = None,\n    tokenizer_args: Optional[Dict] = None,\n    batch_size: int = 1,\n    **kwargs,\n):\n    if is_dict_type(input_data[0]):\n        # already tokenized data\n        if \"labels\" in input_data[0]:\n            labels = [data.pop(\"labels\") for data in input_data]\n        else:\n            labels = None\n        input_example = input_data[0]\n        output_structure, output_type = get_output_structure_from_dict(\n            input_example=input_example,\n            model=model,\n            device=device,\n        )\n        input_data = _HFDictDataset(\n            input_data=input_data,\n            ys=labels,\n            keywords=list(input_example.keys()),\n        )\n\n    else:\n        assert tokenizer is not None, 
(\n            \"Tokenizer is needed when passing data in string format. Please \"\n            \"provide the tokenizer as keyword argument.\"\n        )\n        if tokenizer_args is None:\n            tokenizer_args = {}\n        if not isinstance(input_data[0], str):\n            ys = [data[1] for data in input_data]\n            input_data = [data[0] for data in input_data]\n        else:\n            ys = None\n        output_structure, output_type = get_output_structure_from_text(\n            text=input_data[0],\n            model=model,\n            tokenizer=tokenizer,\n            tokenizer_args=tokenizer_args,\n            device=device,\n        )\n        input_example = tokenizer(input_data, **tokenizer_args)\n        input_data = _HFTextDataset(\n            input_texts=input_data,\n            ys=ys,\n            keywords=list(input_example.keys()),\n            batch_size=batch_size,\n            tokenizer=tokenizer,\n            tokenizer_args=tokenizer_args,\n        )\n    if isinstance(model, torch.nn.Module):\n        wrapper_model = PyTorchTransformerWrapper(\n            core_model=model, encoded_input=input_example\n        )\n    else:\n        wrapper_model = TensorFlowTransformerWrapper(\n            core_model=model, encoded_input=input_example\n        )\n\n    return (\n        wrapper_model,\n        input_data,\n        list(wrapper_model.inputs_types.keys()),\n        output_structure,\n        output_type,\n    )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/conversions/pytorch.py",
    "content": "from contextlib import nullcontext\nfrom pathlib import Path\n\nfrom loguru import logger\n\nfrom nebullvm.config import ONNX_OPSET_VERSION\nfrom nebullvm.core.models import ModelParams, Device, DeviceType, DataType\nfrom nebullvm.optional_modules.torch import torch, Module\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.pytorch import (\n    create_model_inputs_torch,\n)\n\n\n@torch.inference_mode()\ndef convert_torch_to_onnx(\n    torch_model: Module,\n    input_data: DataManager,\n    model_params: ModelParams,\n    output_file_path: Path,\n    device: Device,\n):\n    \"\"\"Function importing a custom model in pytorch and converting it in ONNX\n\n    Args:\n        torch_model (Module): Pytorch model.\n        input_data (DataManager): Custom data provided by user to be\n        used as input for the converter.\n        model_params (ModelParams): Model Parameters as input sizes and\n            dynamic axis information.\n        output_file_path (str or Path): Path where storing the output\n            ONNX file.\n        device (Device): Device where the model will be run.\n    \"\"\"\n\n    if input_data is not None:\n        input_tensors = list(input_data.get_list(1)[0])\n    else:\n        input_tensors = create_model_inputs_torch(model_params.input_infos)\n\n    output_sizes = model_params.output_sizes\n    output_types = model_params.output_types\n\n    input_names = [f\"input_{i}\" for i in range(len(input_tensors))]\n    output_names = [f\"output_{i}\" for i in range(len(output_sizes))]\n    dynamic_info = model_params.dynamic_info\n\n    if dynamic_info is not None:\n        # This check is needed to enable backward compatibility with\n        # previous versions of nebullvm\n        if isinstance(list(dynamic_info.inputs[0].values())[0], str):\n            onnx_format_inputs = dynamic_info.inputs\n        else:\n            onnx_format_inputs = [\n                {k: v[\"name\"] for (k, v) in d.items()}\n            
    for d in dynamic_info.inputs\n            ]\n\n        assert len(dynamic_info.outputs) == len(output_names), (\n            f\"The number of dynamic outputs provided in the dynamic info \"\n            f\"dict ({len(dynamic_info.outputs)}) is not equal to the number \"\n            f\"of outputs of the model ({len(output_names)}), Detected model \"\n            f\"output shapes are: {output_sizes} \"\n        )\n\n        dynamic_info = {\n            name: dynamic_dict\n            for name, dynamic_dict in zip(\n                input_names + output_names,\n                onnx_format_inputs + dynamic_info.outputs,\n            )\n        }\n\n    try:\n        # try conversion with model on cpu\n        if device.type is DeviceType.GPU:\n            input_tensors = [x.cpu() for x in input_tensors]\n            torch_model.cpu()\n\n        torch.onnx.export(\n            torch_model,  # model being run\n            tuple(\n                input_tensors\n            ),  # model input (or a tuple for multiple inputs)\n            str(output_file_path),\n            # where to save the model (can be a file or file-like object)\n            export_params=True,\n            # store the trained parameter weights inside the model file\n            opset_version=ONNX_OPSET_VERSION,\n            # the ONNX version to export the model to\n            do_constant_folding=True,\n            # whether to execute constant folding for optimization\n            input_names=input_names,\n            # the model's input names\n            output_names=output_names,\n            # the model's output names\n            dynamic_axes=dynamic_info,\n        )\n\n        # Put again model on gpu\n        if device.type is DeviceType.GPU:\n            torch_model.to(device.to_torch_format())\n\n        return output_file_path\n    except Exception:\n        # try conversion with model on gpu\n        if device.type is DeviceType.GPU:\n            input_tensors = [\n                
x.to(device.to_torch_format()) for x in input_tensors\n            ]\n            torch_model.to(device.to_torch_format())\n\n            try:\n                with torch.autocast(\"cuda\") if output_types[\n                    0\n                ] is DataType.FLOAT16 else nullcontext():\n                    torch.onnx.export(\n                        torch_model,  # model being run\n                        tuple(\n                            input_tensors\n                        ),  # model input (or a tuple for multiple inputs)\n                        str(output_file_path),\n                        # where to save the model\n                        # (can be a file or file-like object)\n                        export_params=True,\n                        # store the trained parameter weights inside the model\n                        opset_version=ONNX_OPSET_VERSION,\n                        # the ONNX version to export the model to\n                        do_constant_folding=True,\n                        # whether to execute constant folding for optimization\n                        input_names=input_names,\n                        # the model's input names\n                        output_names=output_names,\n                        # the model's output names\n                        dynamic_axes=dynamic_info,\n                    )\n\n                return output_file_path\n            except Exception:\n                logger.warning(\n                    \"Exception raised during conversion from torch\"\n                    \" to onnx model. ONNX pipeline will be unavailable.\"\n                )\n                return None\n        else:\n            logger.warning(\n                \"Exception raised during conversion from torch\"\n                \" to onnx model. ONNX pipeline will be unavailable.\"\n            )\n            return None\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/conversions/tensorflow.py",
    "content": "import subprocess\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Union\n\nfrom loguru import logger\n\nfrom nebullvm.config import ONNX_OPSET_VERSION\nfrom nebullvm.core.models import ModelParams\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf, tf2onnx\nfrom nebullvm.optional_modules.onnx import onnx\nfrom nebullvm.tools.huggingface import TensorFlowTransformerWrapper\n\n\ndef convert_tf_to_onnx(\n    model: Union[tf.Module, tf.keras.Model],\n    model_params: ModelParams,\n    output_file_path: Union[str, Path],\n):\n    \"\"\"Convert TF models into ONNX.\n\n    Args:\n        model (Union[tf.Module, tf.keras.Model]): TF model.\n        model_params (ModelParams): Info about model parameters.\n        output_file_path (Path): Path where storing the output file.\n    \"\"\"\n\n    try:\n        if isinstance(model, tf.keras.Model) or (\n            isinstance(model, TensorFlowTransformerWrapper)\n            and isinstance(model.core_model, tf.keras.Model)\n        ):\n            return convert_keras_to_onnx(model, model_params, output_file_path)\n        else:\n            return convert_tf_saved_model_to_onnx(model, output_file_path)\n    except Exception:\n        logger.warning(\n            \"Something went wrong during conversion from tensorflow\"\n            \" to onnx model. 
ONNX pipeline will be unavailable.\"\n        )\n        return None\n\n\ndef convert_tf_saved_model_to_onnx(\n    model: tf.Module, output_file_path: Union[str, Path]\n):\n    \"\"\"Convert TF models into ONNX.\n    Args:\n        model (tf.Module): TF model.\n        output_file_path (Path): Path where storing the output file.\n    \"\"\"\n    with TemporaryDirectory() as temp_dir:\n        tf.saved_model.save(model, export_dir=temp_dir)\n\n        try:\n            subprocess.check_output([\"python3\", \"--version\"])\n            python_cmd = \"python3\"\n        except subprocess.CalledProcessError:\n            python_cmd = \"python\"\n\n        onnx_cmd = [\n            python_cmd,\n            \"-m\",\n            \"tf2onnx.convert\",\n            \"--saved-model\",\n            f\"{temp_dir}\",\n            \"--output\",\n            f\"{output_file_path}\",\n            \"--opset\",\n            f\"{ONNX_OPSET_VERSION}\",\n        ]\n        subprocess.run(onnx_cmd)\n        onnx.load(output_file_path)\n\n    return output_file_path\n\n\ndef convert_keras_to_onnx(\n    model: tf.keras.Model,\n    model_params: ModelParams,\n    output_file_path: Union[str, Path],\n):\n    \"\"\"Convert keras models into ONNX.\n\n    Args:\n        model (tf.keras.Model): keras model.\n        model_params (ModelParams): Model Parameters as input sizes and\n            dynamic axis information.\n        output_file_path (Path): Path where storing the output file.\n    \"\"\"\n    # get data types for each input\n    dtypes = [\n        model_params.input_infos[i].dtype.value\n        for i in range(len(model_params.input_infos))\n    ]\n    # get input shapes for each input\n    shapes = [\n        [int(x) for x in model_params.input_infos[i].size]\n        for i in range(len(model_params.input_infos))\n    ]\n    # set the dynamic axes for each input\n    if isinstance(model, TensorFlowTransformerWrapper):\n        names = list(model.inputs_types.keys())\n    else:\n      
  names = [f\"input_{i}\" for i in range(len(model_params.input_infos))]\n\n    input_signature = tuple(\n        tf.TensorSpec(\n            (\n                None\n                if model_params.dynamic_info is not None\n                and dim in model_params.dynamic_info.inputs[i]\n                else shape[dim]\n                for dim in range(len(shape))\n            ),\n            dtype,\n            name=name,\n        )\n        for i, (shape, dtype, name) in enumerate(zip(shapes, dtypes, names))\n    )\n\n    onnx_model, _ = tf2onnx.convert.from_keras(\n        model,\n        input_signature,\n        opset=ONNX_OPSET_VERSION,\n        output_path=output_file_path,\n    )\n\n    return output_file_path\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/conversions/utils.py",
    "content": "from nebullvm.core.models import DeepLearningFramework\nfrom nebullvm.operations.conversions.converters import (\n    PytorchConverter,\n    TensorflowConverter,\n    ONNXConverter,\n    Converter,\n)\n\n\ndef get_conversion_op(framework: DeepLearningFramework) -> Converter:\n    if framework == DeepLearningFramework.PYTORCH:\n        conversion_op = PytorchConverter()\n    elif framework == DeepLearningFramework.TENSORFLOW:\n        conversion_op = TensorflowConverter()\n    else:\n        conversion_op = ONNXConverter()\n\n    return conversion_op\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/fetch_operations/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/fetch_operations/local.py",
    "content": "from typing import Any, Union, Iterable, Sequence\n\nfrom nebullvm.operations.base import Operation\n\n\nclass FetchModelFromLocal(Operation):\n    def execute(self, model: Any):\n        self.state[\"model\"] = model\n\n    def get_model(self) -> any:\n        return self.state.get(\"model\")\n\n    def get_result(self) -> Any:\n        pass\n\n\nclass FetchDataFromLocal(Operation):\n    def execute(self, data: Union[Iterable, Sequence]):\n        self.state[\"data\"] = data\n\n    def get_data(self) -> any:\n        return self.state.get(\"data\")\n\n    def get_result(self) -> Any:\n        pass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/base.py",
    "content": "import json\nimport os\nimport shutil\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass, InitVar\nfrom pathlib import Path\nfrom tempfile import mkdtemp, TemporaryDirectory\nfrom typing import Union, Dict, Any, List, Optional\n\nimport numpy as np\n\nfrom nebullvm.config import LEARNER_METADATA_FILENAME\nfrom nebullvm.core.models import ModelParams, Device, QuantizationType\nfrom nebullvm.operations.base import Operation\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.onnx import create_model_inputs_onnx\nfrom nebullvm.tools.pytorch import (\n    create_model_inputs_torch,\n    get_torch_model_size,\n)\nfrom nebullvm.tools.tf import create_model_inputs_tf\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass BuildInferenceLearner(Operation, ABC):\n    def __init__(self):\n        super().__init__()\n        self.inference_learner = None\n\n    @abstractmethod\n    def execute(self, **kwargs):\n        raise NotImplementedError()\n\n    def get_result(self) -> Any:\n        return self.inference_learner\n\n\n@dataclass\nclass BaseInferenceLearner(ABC):\n    \"\"\"Base class for Inference Learners.\"\"\"\n\n    network_parameters: ModelParams\n    input_tfms: Optional[MultiStageTransformation] = None\n    input_data: InitVar[List[Any]] = None\n    device: Device = None\n    quantization_type: QuantizationType = None\n\n    @property\n    @abstractmethod\n    def name(self) -> str:\n        \"\"\"The name of the InferenceLearner\"\"\"\n\n    def __post_init__(self, input_data):\n        if self.input_tfms is not None and len(self.input_tfms) < 0:\n            self.input_tfms = None\n        self._tmp_folder = Path(mkdtemp())\n        self._input_data = input_data\n\n    def _store_file(self, file_path: Union[str, Path]):\n        return shutil.copy(str(file_path), str(self._tmp_folder))\n\n    def _store_dir(self, 
dir_path: Union[str, Path]):\n        try:\n            # For python >= 3.8\n            return shutil.copytree(\n                str(dir_path), str(self._tmp_folder), dirs_exist_ok=True\n            )\n        except TypeError:\n            # For python <=3.7\n            if os.path.isdir(self._tmp_folder):\n                shutil.rmtree(str(self._tmp_folder))\n            return shutil.copytree(str(dir_path), str(self._tmp_folder))\n\n    def __del__(self, shutil=shutil):\n        try:\n            shutil.rmtree(self._tmp_folder, ignore_errors=True)\n        except Exception:\n            pass\n\n    def predict_from_files(\n        self, input_files: List[str], output_files: List[str]\n    ):\n        \"\"\"Get a model prediction from file.\n\n        The input file is read, processed and a prediction is run on top of it.\n        The prediction is then returned into another file (in the same\n        directory of the input file itself).\n\n        Args:\n            input_files (List[str]): List of paths to the input file.\n            output_files (List[str]): List of paths to the file storing\n                the prediction.\n        \"\"\"\n        inputs = (self._read_file(input_file) for input_file in input_files)\n        preds = self(*inputs)\n        for pred, output_file in zip(preds, output_files):\n            self._save_file(pred, output_file)\n\n    def predict_from_listified_tensors(self, *listified_tensors: List):\n        \"\"\"Predict from listified tensor.\n\n        Method useful to be used in services receiving the input tensor\n        from an HTTP call.\n\n        Args:\n            listified_tensors (List): List of list-like version of the\n                input tensors. 
Note that each element of the external list is\n                a listified input tensor.\n\n        Returns:\n            List: List of list-like predictions.\n        \"\"\"\n        inputs = (\n            self.list2tensor(listified_tensor)\n            for listified_tensor in listified_tensors\n        )\n        if self.input_tfms is not None:\n            inputs = (self.input_tfms(_input) for _input in inputs)\n        preds = self.predict(*inputs)\n        return [self.tensor2list(pred) for pred in preds]\n\n    def list2tensor(self, listified_tensor: List) -> Any:\n        \"\"\"Convert list to tensor.\n\n        Args:\n            listified_tensor (List): Listified version of the input tensor.\n\n        Returns:\n            Any: Tensor for the prediction.\n        \"\"\"\n        raise NotImplementedError()\n\n    def tensor2list(self, tensor: Any) -> List:\n        \"\"\"Convert tensor to list.\n\n        Args:\n            tensor (any): Input tensor.\n\n        Returns:\n            List: Listified version of the tensor.\n        \"\"\"\n        raise NotImplementedError()\n\n    def _read_file(self, input_file: str) -> Any:\n        \"\"\"Read tensor from file.\n        Args:\n            input_file (str): Path to the file containing the input tensor.\n\n        Returns:\n            Any: Tensor read from the file.\n        \"\"\"\n        raise NotImplementedError()\n\n    def _save_file(self, prediction: Any, output_file: str):\n        \"\"\"Save prediction in the appropriate format.\n\n        Args:\n            prediction (any): The predicted tensor.\n            output_file (str): Path to the file where storing the prediction.\n        \"\"\"\n        raise NotImplementedError\n\n    def predict(self, *args, **kwargs) -> Any:\n        \"\"\"Take as input a tensor and returns a prediction\"\"\"\n        out = self(*args, **kwargs)\n\n        # TensorFlow predict method must return a np array\n        if isinstance(out[0], tf.Tensor):\n            
out = tuple(t.numpy() for t in out)\n\n        return out\n\n    @abstractmethod\n    def run(self, *args, **kwargs) -> Any:\n        \"\"\"Abstract method implementing the prediction code.\"\"\"\n        raise NotImplementedError()\n\n    def forward(self, *args, **kwargs):\n        \"\"\"Alternative method to the predict one.\"\"\"\n        return self(*args, **kwargs)\n\n    def __call__(self, *args, **kwargs):\n        if self.input_tfms is not None:\n            args = (self.input_tfms(_input) for _input in args)\n        return self.run(*args, **kwargs)\n\n    def save(self, path: Union[str, Path], **kwargs):\n        \"\"\"Save the model.\n\n        Args:\n            path (Path): Path to the directory where saving the model.\n        \"\"\"\n        raise NotImplementedError()\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        \"\"\"Load the model.\n\n        Args:\n            path (Path): Path to the directory where the model is stored.\n\n        Returns:\n            BaseInferenceLearner: Loaded model.\n        \"\"\"\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_size(self):\n        \"\"\"The function returns the size of the optimized model.\"\"\"\n        raise NotImplementedError()\n\n    @abstractmethod\n    def free_gpu_memory(self):\n        \"\"\"The function cleans the gpu occupied by the inference learner.\"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    def get_inputs_example(self):\n        \"\"\"The function returns an example of the input for the optimized\n        model predict method.\n        \"\"\"\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def output_format(self):\n        return \".txt\"\n\n    @property\n    @abstractmethod\n    def input_format(self):\n        return \".txt\"\n\n\nclass LearnerMetadata:\n    \"\"\"Class for storing all the metadata about a model.\n\n    The stored information can be used for loading 
the appropriate model.\n\n    Attributes:\n        class_name (str): Name of the model class. For instance, for the model\n            object `CustomModel()`, the class name is 'CustomModel'.\n        module_name (str): Path to the python module where the model class\n            is defined.\n        network_parameters (Dict): Dictionary containing the network\n            parameters, i.e. batch_size, input_size and output_size.\n        kwargs: External attributes that will be stored in the Metadata file.\n    \"\"\"\n\n    NAME: str = LEARNER_METADATA_FILENAME\n    class_name: str\n    module_name: str\n    device: str\n    quantization_type: str\n\n    def __init__(\n        self,\n        class_name: str,\n        module_name: str,\n        network_parameters: Union[ModelParams, Dict],\n        input_tfms: Union[MultiStageTransformation, Dict] = None,\n        **kwargs,\n    ):\n        self.class_name = class_name\n        self.module_name = module_name\n        self.network_parameters = (\n            network_parameters.dict()\n            if isinstance(network_parameters, ModelParams)\n            else network_parameters\n        )\n        self.input_tfms = (\n            input_tfms.to_dict()\n            if isinstance(input_tfms, MultiStageTransformation)\n            else input_tfms\n        )\n        self.__dict__.update(**kwargs)\n\n    def __getitem__(self, item):\n        if not isinstance(item, str):\n            raise TypeError(\n                f\"Error in key type. 
Expected str got {type(item)}\"\n            )\n        elif item.startswith(\"_\"):\n            raise ValueError(\"Trying to access a private attribute.\")\n        return self.__dict__.get(item)\n\n    @classmethod\n    def from_model(cls, model: BaseInferenceLearner, **kwargs):\n        \"\"\"Create the metadata from the Inference Learner.\n\n        Args:\n            model (BaseInferenceLearner): Model from which extract the\n                metadata.\n            kwargs: External attributes that will be stored in the Metadata\n                file.\n\n        Returns:\n            LearnerMetadata: Metadata associated with the model.\n        \"\"\"\n        return cls(\n            class_name=model.__class__.__name__,\n            module_name=model.__module__,\n            network_parameters=model.network_parameters,\n            input_tfms=model.input_tfms,\n            device=model.device.type.value\n            if model.device is not None\n            else None,\n            quantization_type=model.quantization_type.value\n            if model.quantization_type is not None\n            else None,\n            **kwargs,\n        )\n\n    @classmethod\n    def from_dict(cls, dictionary: Dict):\n        \"\"\"Create the metadata file from a dictionary.\n\n        This method is the reverse one of `to_dict`.\n\n        Args:\n            dictionary (Dict): Dictionary containing the metadata.\n\n        Returns:\n            LearnerMetadata: Metadata associated with the model.\n        \"\"\"\n        if any(\n            key not in dictionary\n            for key in (\"class_name\", \"module_name\", \"network_parameters\")\n        ):\n            raise ValueError(\n                \"The input dictionary should contain both the model class \"\n                \"name and module.\"\n            )\n        return cls(**dictionary)\n\n    def to_dict(self) -> Dict:\n        \"\"\"Method for converting the LearnerMetadata in a python dictionary.\n\n        
Returns:\n            Dict: Dictionary containing the metadata.\n        \"\"\"\n        return {\n            key: value\n            for key, value in self.__dict__.items()\n            if (\n                len(key) > 0\n                and key[0].islower()\n                and not key.startswith(\"_\")\n                and value is not None\n            )\n        }\n\n    @classmethod\n    def read(cls, path: Union[Path, str]):\n        \"\"\"Read the metadata file and store it into a LearnerMetadata object.\n\n        Args:\n            path (Path): Path to the directory containing the metadata file.\n\n        Returns:\n            LearnerMetadata: Metadata associated with the model.\n        \"\"\"\n        path = Path(path)\n        with open(path / cls.NAME, \"r\") as fin:\n            metadata_dict = json.load(fin)\n        return cls(**metadata_dict)\n\n    def save(self, path: Union[Path, str]):\n        \"\"\"Save the metadata of the model in a file.\n\n        Args:\n            path (Path): Path to the directory where saving the model metadata.\n        \"\"\"\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n        metadata_dict = self.to_dict()\n        with open(path / self.NAME, \"w\") as fout:\n            json.dump(metadata_dict, fout)\n\n    def load_model(\n        self, path: Union[Path, str], **kwargs\n    ) -> BaseInferenceLearner:\n        \"\"\"Method for loading the InferenceLearner from its metadata.\n\n        The ModelMetadata file contains all the information necessary for\n        loading the Learner, as it contains both the module where the model\n        is defined and the class name of the model object. 
This method calls\n        the appropriate class method of the Model object, thus the actual\n        model loading is delegated to its methods.\n\n        Args:\n            path (Path): Path to the directory containing the files where\n                the model optimization is saved.\n            kwargs: Dictionary containing the arguments for the model's load\n                function.\n        \"\"\"\n        exec(f\"from {self.module_name} import {self.class_name}\")\n        model = eval(self.class_name).load(path=path, **kwargs)\n        return model\n\n\nclass PytorchBaseInferenceLearner(BaseInferenceLearner, ABC):\n    @property\n    def input_format(self):\n        return \".pt\"\n\n    @property\n    def output_format(self):\n        return \".pt\"\n\n    def list2tensor(self, listified_tensor: List) -> torch.Tensor:\n        \"\"\"Convert list to tensor.\n\n        Args:\n            listified_tensor (List): Listified version of the input tensor.\n\n        Returns:\n            torch.Tensor: Tensor for the prediction.\n        \"\"\"\n        return torch.tensor(listified_tensor)\n\n    def tensor2list(self, tensor: torch.Tensor) -> List:\n        \"\"\"Convert tensor to list.\n\n        Args:\n            tensor (any): Input tensor.\n\n        Returns:\n            List: Listified version of the tensor.\n        \"\"\"\n        return tensor.cpu().detach().numpy().tolist()\n\n    def free_gpu_memory(self):\n        self.model.cpu()\n        self._is_gpu_ready = False\n\n    def set_model_on_gpu(self):\n        self.model.to(self.device.to_torch_format())\n        self._is_gpu_ready = True\n\n    def _read_file(self, input_file: Union[str, Path]) -> torch.Tensor:\n        input_tensor = torch.load(input_file)\n        return input_tensor\n\n    def _save_file(\n        self, prediction: torch.Tensor, output_file: Union[str, Path]\n    ):\n        torch.save(prediction, output_file)\n\n    def get_inputs_example(self, random=False):\n        if 
self._input_data is None or random:\n            return tuple(\n                create_model_inputs_torch(\n                    input_infos=self.network_parameters.input_infos,\n                )\n            )\n        else:\n            return self._input_data\n\n    def get_size(self):\n        try:\n            if hasattr(self.model, \"core_model\"):\n                return get_torch_model_size(self.model.core_model)\n            else:\n                # Normal torch model\n                return get_torch_model_size(self.model)\n        except RuntimeError:\n            with TemporaryDirectory() as tmp_dir:\n                self.save(tmp_dir)\n                return sum(\n                    os.path.getsize(Path(tmp_dir) / f)\n                    for f in os.listdir(Path(tmp_dir))\n                    if os.path.isfile(Path(tmp_dir) / f)\n                )\n\n\nclass TensorflowBaseInferenceLearner(BaseInferenceLearner, ABC):\n    @property\n    def input_format(self):\n        return \".npy\"\n\n    @property\n    def output_format(self):\n        return \".npy\"\n\n    def free_gpu_memory(self):\n        tf.keras.backend.clear_session()\n        self._is_gpu_ready = False\n\n    def set_model_on_gpu(self):\n        self._is_gpu_ready = True\n\n    def list2tensor(self, listified_tensor: List) -> tf.Tensor:\n        \"\"\"Convert list to tensor.\n\n        Args:\n            listified_tensor (List): Listified version of the input tensor.\n\n        Returns:\n            tf.Tensor: Tensor ready to be used for prediction.\n        \"\"\"\n        return tf.convert_to_tensor(listified_tensor)\n\n    def tensor2list(self, tensor: tf.Tensor) -> List:\n        \"\"\"Convert tensor to list.\n\n        Args:\n            tensor (tf.Tensor): Input tensor.\n\n        Returns:\n            List: Listified version of the tensor.\n        \"\"\"\n        return tensor.numpy().tolist()\n\n    def _read_file(self, input_file: Union[str, Path]) -> tf.Tensor:\n        
numpy_array = np.load(input_file)\n        input_tensor = tf.convert_to_tensor(numpy_array)\n        return input_tensor\n\n    def _save_file(self, prediction: tf.Tensor, output_file: Union[str, Path]):\n        prediction.numpy().save(output_file)\n\n    def get_inputs_example(self, random=False):\n        if self._input_data is None or random:\n            return tuple(\n                create_model_inputs_tf(\n                    input_infos=self.network_parameters.input_infos,\n                )\n            )\n        else:\n            return self._input_data\n\n\nclass NumpyBaseInferenceLearner(BaseInferenceLearner, ABC):\n    @property\n    def input_format(self):\n        return \".npy\"\n\n    @property\n    def output_format(self):\n        return \".npy\"\n\n    def list2tensor(self, listified_tensor: List) -> np.ndarray:\n        \"\"\"Convert list to numpy arrays.\n\n        Args:\n            listified_tensor (List): Listified version of the input tensor.\n\n        Returns:\n            np.array: Tensor ready to be used for prediction.\n        \"\"\"\n        return np.array(listified_tensor)\n\n    def tensor2list(self, tensor: np.ndarray) -> List:\n        \"\"\"Convert tensor to list.\n\n        Args:\n            tensor (np.ndarray): Input tensor.\n\n        Returns:\n            List: Listified version of the tensor.\n        \"\"\"\n        return tensor.tolist()\n\n    def _read_file(self, input_file: Union[str, Path]) -> np.ndarray:\n        numpy_array = np.load(input_file)\n        return numpy_array\n\n    def _save_file(\n        self, prediction: np.ndarray, output_file: Union[str, Path]\n    ):\n        np.save(output_file, prediction)\n\n    def get_inputs_example(self, random=False):\n        if self._input_data is None or random:\n            return tuple(\n                create_model_inputs_onnx(\n                    input_infos=self.network_parameters.input_infos,\n                )\n            )\n        else:\n            
return self._input_data\n\n\nclass InferenceLearnerWrapper(BaseInferenceLearner, ABC):\n    \"\"\"Wrapper model around InferenceLearners. It's a base class: cannot be\n    instantiated.\n\n    For all the BaseInferenceLearner-related methods, the implementation of\n    the core model will be used. This class just re-implements the load and\n    save methods, allowing (and forcing) the child class to re-implement the\n    `predict` method.\n\n    Attributes:\n        network_parameters (ModelParams): Model parameters.\n        core_inference_learner (BaseInferenceLearner): Inference Learner.\n    \"\"\"\n\n    CORE_MODEL_SAVE_DIR = \"core_model\"\n\n    def __init__(self, core_inference_learner: BaseInferenceLearner):\n        super().__init__(\n            network_parameters=core_inference_learner.network_parameters\n        )\n        self.core_inference_learner = core_inference_learner\n\n    def list2tensor(self, listified_tensor: List) -> Any:\n        return self.core_inference_learner.list2tensor(listified_tensor)\n\n    def tensor2list(self, tensor: Any) -> List:\n        return self.core_inference_learner.tensor2list(tensor)\n\n    def _read_file(self, input_file: str) -> Any:\n        return self.core_inference_learner._read_file(input_file)\n\n    def _save_file(self, prediction: Any, output_file: str):\n        self.core_inference_learner._save_file(prediction, output_file)\n\n    def save(self, path: Union[str, Path], **kwargs):\n        core_model_path = Path(path) / self.CORE_MODEL_SAVE_DIR\n        core_model_path.mkdir(exist_ok=True, parents=True)\n        self.core_inference_learner.save(core_model_path, **kwargs)\n        extra_metadata_kwargs = self._get_extra_metadata_kwargs()\n        metadata = LearnerMetadata.from_model(self, **extra_metadata_kwargs)\n        metadata.save(path)\n        self._save_wrapper_extra_info()\n\n    def _get_extra_metadata_kwargs(self) -> Dict:\n        raise NotImplementedError\n\n    def 
_save_wrapper_extra_info(self):\n        raise NotImplementedError\n\n    @staticmethod\n    def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict:\n        raise NotImplementedError\n\n    @staticmethod\n    def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:\n        raise NotImplementedError\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        core_model_path = Path(path) / cls.CORE_MODEL_SAVE_DIR\n        core_learner = LearnerMetadata.read(core_model_path).load_model(\n            core_model_path, **kwargs\n        )\n        metadata = LearnerMetadata.read(path)\n        input_dict = cls._convert_metadata_to_inputs(metadata)\n        input_dict = cls._load_wrapper_extra_info(input_dict)\n        input_dict.update({\"core_inference_learner\": core_learner})\n        return cls(**input_dict)\n\n    def free_gpu_memory(self):\n        return self.core_inference_learner.free_gpu_memory()\n\n    def get_inputs_example(self):\n        return self.core_inference_learner.get_inputs_example()\n\n    @property\n    def output_format(self):\n        return self.core_inference_learner.output_format\n\n    @property\n    def input_format(self):\n        return self.core_inference_learner.input_format\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/blade_disc.py",
    "content": "from typing import Optional\n\nfrom nebullvm.core.models import ModelParams, Device\nfrom nebullvm.operations.inference_learners.torchscript import (\n    TorchScriptInferenceLearner,\n)\nfrom nebullvm.optional_modules.torch import ScriptModule\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass BladeDISCInferenceLearner(TorchScriptInferenceLearner):\n    name = \"BladeDISC\"\n\n    @classmethod\n    def from_torch_model(\n        cls,\n        model: ScriptModule,\n        network_parameters: ModelParams,\n        device: Device,\n        input_tfms: Optional[MultiStageTransformation] = None,\n        input_data: DataManager = None,\n    ):\n        return cls(\n            torch_model=model,\n            network_parameters=network_parameters,\n            input_tfms=input_tfms,\n            input_data=input_data,\n            device=device,\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/builders.py",
    "content": "from pathlib import Path\nfrom typing import Any, Union\n\nfrom nebullvm.core.models import (\n    ModelParams,\n    DeepLearningFramework,\n    QuantizationType,\n    DeviceType,\n)\nfrom nebullvm.operations.inference_learners.base import BuildInferenceLearner\nfrom nebullvm.operations.inference_learners.deepsparse import (\n    PytorchDeepSparseInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.faster_transformer import (\n    FasterTransformerInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.neural_compressor import (\n    PytorchNeuralCompressorInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.onnx import ONNX_INFERENCE_LEARNERS\nfrom nebullvm.operations.inference_learners.openvino import (\n    OPENVINO_INFERENCE_LEARNERS,\n)\nfrom nebullvm.operations.inference_learners.tensor_rt import (\n    TENSOR_RT_INFERENCE_LEARNERS,\n    PytorchTensorRTInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tensorflow import (\n    TensorflowBackendInferenceLearner,\n    TFLiteBackendInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.torch_dynamo import (\n    TorchDynamoInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.torch_neuron import (\n    TorchNeuronInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.torch_xla import (\n    TorchXLAInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.torchscript import (\n    TorchScriptInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tvm import (\n    APACHE_TVM_INFERENCE_LEARNERS,\n    PytorchApacheTVMInferenceLearner,\n)\nfrom nebullvm.optional_modules.tensor_rt import tensorrt as trt\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import (\n    ScriptModule,\n    Module,\n    GraphModule,\n    torch,\n)\nfrom nebullvm.optional_modules.tvm import tvm, ExecutorFactoryModule\nfrom nebullvm.tools.onnx import get_input_names, 
get_output_names\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    VerifyContiguity,\n)\n\n\nclass TorchScriptBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: ScriptModule,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = TorchScriptInferenceLearner(\n            torch_model=model,\n            network_parameters=model_params,\n            input_tfms=input_tfms,\n            device=self.device,\n        )\n\n\nclass TorchXLABuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: torch.nn.Module,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = TorchXLAInferenceLearner(\n            torch_model=model,\n            network_parameters=model_params,\n            input_tfms=input_tfms,\n            device=self.device,\n        )\n\n\nclass TorchNeuronBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: ScriptModule,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = TorchNeuronInferenceLearner(\n            torch_model=model,\n            network_parameters=model_params,\n            input_tfms=input_tfms,\n            device=self.device,\n        )\n\n\nclass TorchDynamoBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: ScriptModule,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = TorchDynamoInferenceLearner(\n            torch_model=model,\n            network_parameters=model_params,\n            input_tfms=input_tfms,\n            device=self.device,\n        )\n\n\nclass 
TensorflowBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: tf.Module,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = TensorflowBackendInferenceLearner(\n            model,\n            network_parameters=model_params,\n            input_tfms=input_tfms,\n            device=self.device,\n        )\n\n\nclass TFLiteBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: bytes,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = TFLiteBackendInferenceLearner(\n            model,\n            network_parameters=model_params,\n            input_tfms=input_tfms,\n            device=self.device,\n        )\n\n\nclass DeepSparseBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: Union[str, Path],\n        model_params: ModelParams,\n        **kwargs,\n    ):\n        input_names = get_input_names(str(model))\n        output_names = get_output_names(str(model))\n\n        self.inference_learner = PytorchDeepSparseInferenceLearner(\n            onnx_path=model,\n            network_parameters=model_params,\n            input_names=input_names,\n            output_names=output_names,\n            device=self.device,\n        )\n\n\nclass ONNXBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: Union[str, Path],\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        source_dl_framework: DeepLearningFramework,\n        quantization_type: QuantizationType,\n        **kwargs,\n    ):\n        input_names = get_input_names(str(model))\n        output_names = get_output_names(str(model))\n\n        self.inference_learner = ONNX_INFERENCE_LEARNERS[source_dl_framework](\n            
onnx_path=model,\n            network_parameters=model_params,\n            input_names=input_names,\n            output_names=output_names,\n            input_tfms=input_tfms,\n            device=self.device,\n            quantization_type=quantization_type,\n        )\n\n\nclass OpenVINOBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: str,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        source_dl_framework: DeepLearningFramework,\n        **kwargs,\n    ):\n        self.inference_learner = OPENVINO_INFERENCE_LEARNERS[\n            source_dl_framework\n        ].from_model_name(\n            model_name=model + \".xml\",\n            model_weights=model + \".bin\",\n            input_tfms=input_tfms,\n            network_parameters=model_params,\n            device=self.device,\n        )\n\n\nclass PyTorchTensorRTBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: ScriptModule,\n        input_tfms: MultiStageTransformation,\n        model_params: ModelParams,\n        **kwargs,\n    ):\n        self.inference_learner = PytorchTensorRTInferenceLearner(\n            torch_model=model,\n            input_tfms=input_tfms,\n            network_parameters=model_params,\n            device=self.device,\n        )\n\n\nclass ONNXTensorRTBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: Any,\n        model_orig: Union[str, Path],\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        source_dl_framework: DeepLearningFramework,\n        **kwargs,\n    ):\n        nvidia_logger = trt.Logger(trt.Logger.ERROR)\n        input_names = get_input_names(str(model_orig))\n        output_names = get_output_names(str(model_orig))\n\n        input_tfms.append(VerifyContiguity())\n        runtime = trt.Runtime(nvidia_logger)\n        engine = 
runtime.deserialize_cuda_engine(model)\n\n        self.inference_learner = TENSOR_RT_INFERENCE_LEARNERS[\n            source_dl_framework\n        ](\n            engine=engine,\n            input_tfms=input_tfms,\n            network_parameters=model_params,\n            input_names=input_names,\n            output_names=output_names,\n            nvidia_logger=nvidia_logger,\n            device=self.device,\n        )\n\n\nclass IntelNeuralCompressorBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: GraphModule,\n        model_orig: Module,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = PytorchNeuralCompressorInferenceLearner(\n            model=model_orig,\n            model_quant=model,\n            input_tfms=input_tfms,\n            network_parameters=model_params,\n            device=self.device,\n        )\n\n\nclass PyTorchApacheTVMBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: ExecutorFactoryModule,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        target_device = (\n            str(tvm.target.cuda())\n            if self.device.type is DeviceType.GPU\n            else \"llvm\"\n        )\n        dev = tvm.device(str(target_device), 0)\n\n        input_names = [\n            f\"input_{i}\" for i in range(len(model_params.input_infos))\n        ]\n\n        graph_executor_module = tvm.contrib.graph_executor.GraphModule(\n            model[\"default\"](dev)\n        )\n        self.inference_learner = PytorchApacheTVMInferenceLearner(\n            input_tfms=input_tfms,\n            network_parameters=model_params,\n            graph_executor_module=graph_executor_module,\n            input_names=input_names,\n            lib=model,\n            target=target_device,\n            device=self.device,\n   
     )\n\n\nclass ONNXApacheTVMBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: ExecutorFactoryModule,\n        model_orig: str,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        source_dl_framework: DeepLearningFramework,\n        **kwargs,\n    ):\n        target_device = (\n            str(tvm.target.cuda())\n            if self.device.type is DeviceType.GPU\n            else \"llvm\"\n        )\n        dev = tvm.device(str(target_device), 0)\n\n        input_names = (\n            get_input_names(model_orig)\n            if model_orig is not None\n            else [f\"input_{i}\" for i in range(len(model_params.input_infos))]\n        )\n\n        graph_executor_module = tvm.contrib.graph_executor.GraphModule(\n            model[\"default\"](dev)\n        )\n        self.inference_learner = APACHE_TVM_INFERENCE_LEARNERS[\n            source_dl_framework\n        ](\n            input_tfms=input_tfms,\n            network_parameters=model_params,\n            graph_executor_module=graph_executor_module,\n            input_names=input_names,\n            lib=model,\n            target=target_device,\n            device=self.device,\n        )\n\n\nclass FasterTransformerBuildInferenceLearner(BuildInferenceLearner):\n    def execute(\n        self,\n        model: ScriptModule,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation,\n        **kwargs,\n    ):\n        self.inference_learner = FasterTransformerInferenceLearner(\n            torch_model=model,\n            network_parameters=model_params,\n            input_tfms=input_tfms,\n            device=self.device,\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/deepsparse.py",
    "content": "import os\nimport shutil\nfrom abc import ABC\nfrom pathlib import Path\nfrom typing import Union, List, Generator, Tuple, Dict, Type\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.config import ONNX_FILENAMES\nfrom nebullvm.core.models import Device, ModelParams, DeepLearningFramework\nfrom nebullvm.operations.inference_learners.base import (\n    BaseInferenceLearner,\n    LearnerMetadata,\n    PytorchBaseInferenceLearner,\n)\nfrom nebullvm.optional_modules.deepsparse import cpu, compile_model\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass DeepSparseInferenceLearner(BaseInferenceLearner, ABC):\n    \"\"\"Model optimized on CPU using DeepSparse. DeepSparse is an engine\n    accelerating sparse computations on CPUs.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        onnx_path (str or Path): Path to the onnx model.\n        input_names (List[str]): Input names used when the onnx model\n            was produced.\n        output_names (List[str]): Output names used when the onnx model\n            was produced.\n    \"\"\"\n\n    name = \"DeepSparse\"\n\n    def __init__(\n        self,\n        onnx_path: Union[str, Path],\n        input_names: List[str],\n        output_names: List[str],\n        device: Device,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        self.onnx_path = self._store_file(onnx_path)\n\n        # Compile model\n        cores_per_socket, _, _ = cpu.cpu_details()\n        # Define the number of cores to use, by default it will make use of\n        # all physical cores on the system\n        num_cores = cores_per_socket\n        batch_size = kwargs[\"network_parameters\"].batch_size\n        self.engine = compile_model(onnx_path, batch_size, num_cores)\n\n        self.input_names = input_names\n        
self.output_names = output_names\n        self.device = device\n\n    def get_size(self):\n        return os.path.getsize(self.onnx_path)\n\n    def save(self, path: Union[str, Path], **kwargs):\n        \"\"\"Save the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model will\n                be stored.\n            kwargs (Dict): Dictionary of key-value pairs that will be saved in\n                the model metadata file.\n        \"\"\"\n        metadata = LearnerMetadata.from_model(\n            self,\n            input_names=self.input_names,\n            output_names=self.output_names,\n            **kwargs,\n        )\n        metadata.save(path)\n\n        shutil.copy(\n            self.onnx_path,\n            Path(path) / ONNX_FILENAMES[\"model_name\"],\n        )\n\n    def free_gpu_memory(self):\n        raise NotImplementedError(\"DeepSparse does not support GPU inference.\")\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        \"\"\"Load the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model is\n                stored.\n            kwargs (Dict): Dictionary of additional arguments for consistency\n                with other Learners.\n\n        Returns:\n            DeepSparseInferenceLearner: The optimized model.\n        \"\"\"\n        if len(kwargs) > 0:\n            logger.warning(\n                f\"No extra keywords expected for the load method. 
\"\n                f\"Got {kwargs}.\"\n            )\n        onnx_path = os.path.join(str(path), ONNX_FILENAMES[\"model_name\"])\n        metadata = LearnerMetadata.read(path)\n        input_tfms = metadata.input_tfms\n        if input_tfms is not None:\n            input_tfms = MultiStageTransformation.from_dict(\n                metadata.input_tfms\n            )\n        device = Device.from_str(metadata.device)\n        return cls(\n            input_tfms=input_tfms,\n            network_parameters=ModelParams(**metadata.network_parameters),\n            onnx_path=onnx_path,\n            input_names=metadata[\"input_names\"],\n            output_names=metadata[\"output_names\"],\n            device=device,\n        )\n\n    def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):\n        inputs = [array for array in input_arrays]\n        outputs = self.engine(inputs)\n        return outputs\n\n\nclass PytorchDeepSparseInferenceLearner(\n    DeepSparseInferenceLearner, PytorchBaseInferenceLearner\n):\n    \"\"\"Model optimized on CPU using DeepSparse. DeepSparse is an engine\n    accelerating sparse computations on CPUs.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        onnx_path (str or Path): Path to the onnx model.\n        input_names (List[str]): Input names used when the onnx model\n            was produced.\n        output_names (List[str]): Output names used when the onnx model\n            was produced.\n    \"\"\"\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. 
If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected to have dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors do\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        input_arrays = (\n            input_tensor.cpu().detach().numpy()\n            for input_tensor in input_tensors\n        )\n        outputs = self._predict_arrays(input_arrays)\n        return tuple(torch.from_numpy(output) for output in outputs)\n\n\nDEEPSPARSE_INFERENCE_LEARNERS: Dict[\n    DeepLearningFramework, Type[DeepSparseInferenceLearner]\n] = {DeepLearningFramework.PYTORCH: PytorchDeepSparseInferenceLearner}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/faster_transformer.py",
    "content": "from nebullvm.operations.inference_learners.torchscript import (\n    TorchScriptInferenceLearner,\n)\n\n\nclass FasterTransformerInferenceLearner(TorchScriptInferenceLearner):\n    MODEL_NAME = \"faster_transformer_model_scripted.pt\"\n    name = \"FasterTransformer\"\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/huggingface.py",
    "content": "from abc import ABC\nfrom collections import OrderedDict\nfrom pathlib import Path\nfrom typing import List, Any, Dict, Union\n\nfrom nebullvm.operations.inference_learners.base import (\n    InferenceLearnerWrapper,\n    PytorchBaseInferenceLearner,\n    LearnerMetadata,\n    BaseInferenceLearner,\n)\nfrom nebullvm.optional_modules.diffusers import StableDiffusionPipeline\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.diffusers import postprocess_diffusers\nfrom nebullvm.tools.huggingface import restructure_output\nfrom nebullvm.tools.pytorch import get_torch_model_size\n\n\nclass HuggingFaceInferenceLearner(InferenceLearnerWrapper):\n    \"\"\"Class wrapping an InferenceLearner model and giving to it the\n    huggingface interface.\n\n    The class fuse both the InterfaceLearner and HuggingFace interfaces, giving\n    to the final user a model which can be used whit the prefered API without\n    the need of adapting the previous code.\n\n    Attributes:\n        network_parameters (ModelParams): Model parameters of the model.\n        core_inference_learner (PytorchBaseInferenceLearner): Inference learner\n            built using the Pytorch interface.\n        output_structure (Dict): Original output structure of the HuggingFace\n            model.\n        input_names (List[str]): List of all the input keys used for the\n            original HuggingFace model.\n        output_type (Any, optional): Original output type of the HuggingFace\n            model.\n    \"\"\"\n\n    @property\n    def name(self) -> str:\n        return self.core_inference_learner.name\n\n    def __init__(\n        self,\n        core_inference_learner: PytorchBaseInferenceLearner,\n        output_structure: OrderedDict,\n        input_names: List[str],\n        output_type: Any = None,\n    ):\n        super().__init__(core_inference_learner)\n        self.output_structure = output_structure\n        self.input_names = input_names\n        
self.output_type = output_type\n\n    def _save_wrapper_extra_info(self):\n        pass\n\n    def get_size(self):\n        return self.core_inference_learner.get_size()\n\n    @staticmethod\n    def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:\n        return builder_inputs\n\n    def run(self, *args, **kwargs) -> Any:\n        \"\"\"Run the underlying optimized model for getting a prediction.\n\n        The method has an hybrid interface. It accepts inputs either as\n        positional or keyword arguments. If only positional arguments are given\n        the method expects the inputs to be in the canonical\n        nebullvm interface. If only keyword arguments are given the method\n        expects them to be in the HuggingFace interface. Mixed representation\n        is not allowed and will result in an error.\n        \"\"\"\n        if len(args) > 0 and len(kwargs) > 0:\n            raise RuntimeError(\n                \"Not allowed usage of the predict method. \"\n                \"Either the positional or the keyword arguments must be given.\"\n            )\n        if len(args) > 0:\n            return self.core_inference_learner(*args)\n        inputs = (kwargs.pop(name) for name in self.input_names)\n        outputs = self.core_inference_learner(*inputs)\n\n        if self.output_type is tuple:\n            return outputs\n        else:\n            return restructure_output(\n                outputs, self.output_structure, self.output_type\n            )\n\n    def _get_extra_metadata_kwargs(self) -> Dict:\n        metadata_kwargs = {\n            \"output_structure\": self.output_structure,\n            \"output_structure_keys\": list(self.output_structure.keys()),\n            \"input_names\": self.input_names,\n        }\n        if self.output_type is not None:\n            metadata_kwargs.update(\n                {\n                    \"output_type\": self.output_type.__name__,\n                    \"output_type_module\": 
self.output_type.__module__,\n                }\n            )\n        return metadata_kwargs\n\n    @staticmethod\n    def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict:\n        # we need to guarantee the preservation of the output structure\n        # elements order.\n        output_structure = OrderedDict()\n        for key in metadata[\"output_structure_keys\"]:\n            output_structure[key] = metadata[\"output_structure\"][key]\n\n        inputs = {\n            \"output_structure\": output_structure,\n            \"input_names\": metadata[\"input_names\"],\n        }\n        if metadata[\"output_type\"] is not None:\n            exec(\n                f\"from {metadata['output_type_module']} \"\n                f\"import {metadata['output_type']}\"\n            )\n            inputs[\"output_type\"] = eval(metadata[\"output_type\"])\n        return inputs\n\n\nclass DiffusionInferenceLearner(BaseInferenceLearner, ABC):\n    @property\n    def name(self) -> str:\n        return self.pipeline.unet.model.name\n\n    def __init__(self, pipeline: StableDiffusionPipeline):\n        self.pipeline = pipeline\n\n    def __call__(self, *args, **kwargs):\n        return self.pipeline(*args, **kwargs)\n\n    def run(self, *args, **kwargs) -> Any:\n        self.pipeline(*args, **kwargs)\n\n    def save(self, path: Union[str, Path], **kwargs):\n        self.pipeline.unet.model.save(path)\n\n    @classmethod\n    def load(\n        cls,\n        path: Union[Path, str],\n        **kwargs,\n    ):\n        try:\n            pipe = kwargs[\"pipe\"]\n        except KeyError:\n            raise TypeError(\"Missing required argument 'pipe'\")\n        optimized_model = LearnerMetadata.read(path).load_model(path)\n        return postprocess_diffusers(\n            optimized_model,\n            pipe,\n            optimized_model.device,\n        )\n\n    def get_size(self):\n        (\n            self.pipeline.unet.model.get_size()\n            + sum(\n    
            [\n                    get_torch_model_size(v)\n                    for (k, v) in self.pipeline.__dict__.items()\n                    if isinstance(v, torch.nn.Module) and k != \"unet\"\n                ]\n            )\n            / 1e6\n        )\n\n    def free_gpu_memory(self):\n        raise self.pipeline.unet.model.free_gpu_memory()\n\n    def get_inputs_example(self):\n        raise NotImplementedError()\n\n    @property\n    def output_format(self):\n        return \".pt\"\n\n    @property\n    def input_format(self):\n        return \".pt\"\n\n    def list2tensor(self, listified_tensor: List) -> Any:\n        raise NotImplementedError()\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/neural_compressor.py",
    "content": "from abc import ABC\nfrom pathlib import Path\nfrom typing import Union, Tuple, Dict, Type\n\nfrom loguru import logger\n\nfrom nebullvm.core.models import Device, ModelParams, DeepLearningFramework\nfrom nebullvm.operations.inference_learners.base import (\n    BaseInferenceLearner,\n    LearnerMetadata,\n    PytorchBaseInferenceLearner,\n)\nfrom nebullvm.optional_modules.neural_compressor import (\n    cfgs_to_fx_cfgs,\n    cfg_to_qconfig,\n)\nfrom nebullvm.optional_modules.torch import (\n    torch,\n    prepare_fx,\n    convert_fx,\n    Module,\n    GraphModule,\n)\nfrom nebullvm.tools.pytorch import (\n    save_with_torch_fx,\n    load_with_torch_fx,\n    create_model_inputs_torch,\n    get_torch_model_size,\n)\nfrom nebullvm.tools.transformations import MultiStageTransformation\nfrom nebullvm.tools.utils import check_module_version\n\n\nclass NeuralCompressorInferenceLearner(BaseInferenceLearner, ABC):\n    \"\"\"Model optimized on CPU using IntelNeuralCompressor.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        model (torch.fx.GraphModule): Torch fx graph model.\n    \"\"\"\n\n    name = \"IntelNeuralCompressor\"\n\n    def __init__(\n        self,\n        model: Union[Module, GraphModule],\n        model_quant: GraphModule,\n        device: Device,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        self.model = model\n        self.model_quant = model_quant\n        self.device = device\n\n    def get_size(self):\n        return get_torch_model_size(self.model_quant) + get_torch_model_size(\n            self.model\n        )\n\n    def save(self, path: Union[str, Path], **kwargs):\n        \"\"\"Save the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model will\n                be stored.\n            kwargs (Dict): Dictionary of key-value pairs that will be saved in\n               
 the model metadata file.\n        \"\"\"\n        metadata = LearnerMetadata.from_model(self, **kwargs)\n        metadata.save(path)\n\n        path_orig_model = Path(path) / Path(\"model_orig\")\n        path_quant_model = Path(path) / Path(\"model_quant\")\n\n        save_with_torch_fx(self.model, path_orig_model)\n        self.model_quant.save(str(path_quant_model))\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        \"\"\"Load the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model is\n                stored.\n            kwargs (Dict): Dictionary of additional arguments for consistency\n                with other Learners.\n\n        Returns:\n            DeepSparseInferenceLearner: The optimized model.\n        \"\"\"\n        if len(kwargs) > 0:\n            logger.warning(\n                f\"No extra keywords expected for the load method. \"\n                f\"Got {kwargs}.\"\n            )\n\n        metadata = LearnerMetadata.read(path)\n        input_tfms = metadata.input_tfms\n        if input_tfms is not None:\n            input_tfms = MultiStageTransformation.from_dict(\n                metadata.input_tfms\n            )\n\n        network_parameters = ModelParams(**metadata.network_parameters)\n\n        path_orig_model = Path(path) / Path(\"model_orig\")\n        path_quant_model = Path(path) / Path(\"model_quant\") / \"best_model.pt\"\n\n        model = load_with_torch_fx(\n            Path(path_orig_model), \"state_dict.pt\"\n        ).eval()\n        state_dict = torch.load(path_quant_model)\n\n        tune_cfg = state_dict.pop(\"best_configure\")\n        op_cfgs = cfg_to_qconfig(tune_cfg, tune_cfg[\"approach\"])\n        fx_op_cfgs = cfgs_to_fx_cfgs(op_cfgs, tune_cfg[\"approach\"])\n\n        additional_arguments = {}\n        if check_module_version(torch, min_version=\"1.13.0\"):\n            additional_arguments[\"example_inputs\"] = tuple(\n                
create_model_inputs_torch(\n                    input_infos=network_parameters.input_infos,\n                )\n            )\n\n        q_model = prepare_fx(\n            model,\n            fx_op_cfgs,\n            **additional_arguments,\n        )\n        q_model = convert_fx(q_model)\n\n        q_model.load_state_dict(state_dict)\n        device = Device.from_str(metadata.device)\n\n        return cls(\n            model=model,\n            model_quant=q_model,\n            device=device,\n            input_tfms=input_tfms,\n            network_parameters=ModelParams(**metadata.network_parameters),\n        )\n\n\nclass PytorchNeuralCompressorInferenceLearner(\n    NeuralCompressorInferenceLearner, PytorchBaseInferenceLearner\n):\n    \"\"\"Model optimized on CPU using IntelNeuralCompressor.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        model (torch.fx.GraphModule): Torch fx graph model.\n    \"\"\"\n\n    def free_gpu_memory(self):\n        raise NotImplementedError(\n            \"NeuralCompressor does not support GPU inference.\"\n        )\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. 
In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        inputs = (t.cpu() for t in input_tensors)\n        outputs = self.model_quant(*inputs)\n\n        if isinstance(outputs, torch.Tensor):\n            outputs = (outputs,)\n\n        return outputs\n\n\nNEURAL_COMPRESSOR_INFERENCE_LEARNERS: Dict[\n    DeepLearningFramework, Type[NeuralCompressorInferenceLearner]\n] = {DeepLearningFramework.PYTORCH: PytorchNeuralCompressorInferenceLearner}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/onnx.py",
    "content": "import multiprocessing\nimport os\nimport shutil\nfrom abc import ABC\nfrom pathlib import Path\nfrom typing import Union, List, Generator, Tuple, Dict, Type\n\nimport cpuinfo\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.config import (\n    ONNX_FILENAMES,\n    ONNX_PROVIDERS,\n)\nfrom nebullvm.core.models import (\n    QuantizationType,\n    Device,\n    DeviceType,\n    ModelParams,\n    DeepLearningFramework,\n)\nfrom nebullvm.operations.inference_learners.base import (\n    BaseInferenceLearner,\n    LearnerMetadata,\n    PytorchBaseInferenceLearner,\n    TensorflowBaseInferenceLearner,\n    NumpyBaseInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    tensorrt_is_available,\n)\nfrom nebullvm.optional_modules.onnx import onnx\nfrom nebullvm.optional_modules.onnxruntime import onnxruntime as ort\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\ndef _running_on_intel_cpu(use_gpu):\n    if use_gpu:\n        return False  # running on GPU\n    cpu_info = cpuinfo.get_cpu_info()[\"brand_raw\"].lower()\n    if \"intel\" in cpu_info:\n        return True\n    return False\n\n\ndef _get_ort_session_options(use_gpu) -> ort.SessionOptions:\n    sess_options = ort.SessionOptions()\n    sess_options.graph_optimization_level = (\n        ort.GraphOptimizationLevel.ORT_ENABLE_ALL\n    )\n    if not use_gpu:\n        sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL\n        sess_options.inter_op_num_threads = 1\n        sess_options.intra_op_num_threads = max(\n            int(\n                os.environ.get(\"NEBULLVM_THREADS_PER_MODEL\")\n                or multiprocessing.cpu_count()\n            ),\n            1,\n        )\n    return sess_options\n\n\nclass ONNXInferenceLearner(BaseInferenceLearner, ABC):\n    \"\"\"Model converted to ONNX and 
run with Microsoft's onnxruntime.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        onnx_path (str or Path): Path to the onnx model.\n        input_names (List[str]): Input names used when the onnx model\n            was produced.\n        output_names (List[str]): Output names used when the onnx model\n            was produced.\n    \"\"\"\n\n    name = \"ONNXRuntime\"\n\n    def __init__(\n        self,\n        onnx_path: Union[str, Path],\n        input_names: List[str],\n        output_names: List[str],\n        device: Device,\n        quantization_type: QuantizationType,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        filename = Path(onnx_path).name\n        dir_path = str(Path(onnx_path).parent)\n        self.device = device\n\n        self.onnx_path = Path(self._store_dir(dir_path)) / filename\n        self.sess_options = _get_ort_session_options(\n            self.device.type is DeviceType.GPU\n        )\n        self.quantization_type = quantization_type\n\n        if _running_on_intel_cpu(self.device.type is DeviceType.GPU):\n            self.sess_options.add_session_config_entry(\n                \"session.set_denormal_as_zero\", \"1\"\n            )\n\n        self.set_model_on_gpu()\n\n        self._is_gpu_ready = self.device.type is DeviceType.GPU\n        self.input_names = input_names\n        self.output_names = output_names\n\n    @staticmethod\n    def _setup_tensorrt(quantization_type: QuantizationType, device: Device):\n        if (\n            tensorrt_is_available()\n            and os.environ.get(\"LD_LIBRARY_PATH\", False)\n            and \"tensorrt\" in os.environ[\"LD_LIBRARY_PATH\"]\n        ):\n            ONNX_PROVIDERS[\"cuda\"][0] = (\n                \"TensorrtExecutionProvider\",\n                {\n                    \"device_id\": device.idx,\n                    \"trt_max_workspace_size\": 
device.get_free_memory(),\n                    \"trt_fp16_enable\": True\n                    if quantization_type is not None\n                    else False,\n                    \"trt_int8_enable\": True\n                    if quantization_type is QuantizationType.STATIC\n                    else False,\n                },\n            )\n        else:\n            if tensorrt_is_available():\n                logger.warning(\n                    \"TensorrtExecutionProvider for onnx is not \"\n                    \"available. If you want to use it, please  \"\n                    \"add the path to tensorrt to the \"\n                    \"LD_LIBRARY_PATH environment variable. \"\n                    \"CUDA provider will be used instead. \"\n                )\n            else:\n                logger.warning(\n                    \"TensorRT is not available. \"\n                    \"If you want to use it, please install it and \"\n                    \"add the path to the LD_LIBRARY_PATH \"\n                    \"environment variable.\"\n                    \"CUDA provider will be used instead. 
\"\n                )\n            if \"TensorrtExecutionProvider\" in ONNX_PROVIDERS[\"cuda\"]:\n                ONNX_PROVIDERS[\"cuda\"].remove(\"TensorrtExecutionProvider\")\n\n    def get_size(self):\n        return sum(\n            os.path.getsize(self.onnx_path.parents[0] / f)\n            for f in os.listdir(self.onnx_path.parents[0])\n            if os.path.isfile(self.onnx_path.parents[0] / f)\n        )\n\n    def free_gpu_memory(self):\n        del self._session\n        self._is_gpu_ready = False\n\n    def set_model_on_gpu(self):\n        if (\n            self.device.type is DeviceType.GPU\n            and len(ONNX_PROVIDERS[\"cuda\"]) == 3\n        ):\n            ONNX_PROVIDERS[\"cuda\"][1] = (\n                \"CUDAExecutionProvider\",\n                {\n                    \"device_id\": self.device.idx,\n                },\n            )\n            self._setup_tensorrt(self.quantization_type, self.device)\n\n        ort_session = ort.InferenceSession(\n            str(self.onnx_path),\n            sess_options=self.sess_options,\n            providers=ONNX_PROVIDERS[\"cuda\"]\n            if self.device.type is DeviceType.GPU\n            else ONNX_PROVIDERS[\"cpu\"],\n        )\n        self._session = ort_session\n        self._is_gpu_ready = True\n\n    def save(self, path: Union[str, Path], **kwargs):\n        \"\"\"Save the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model will\n                be stored.\n            kwargs (Dict): Dictionary of key-value pairs that will be saved in\n                the model metadata file.\n        \"\"\"\n        metadata = LearnerMetadata.from_model(\n            self,\n            input_names=self.input_names,\n            output_names=self.output_names,\n            **kwargs,\n        )\n\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n\n        metadata.save(path)\n\n        shutil.copy(\n            self.onnx_path,\n            
os.path.join(str(path), ONNX_FILENAMES[\"model_name\"]),\n        )\n\n        try:\n            # Tries to load the model\n            onnx.load(os.path.join(str(path), ONNX_FILENAMES[\"model_name\"]))\n        except FileNotFoundError:\n            # If missing files, it means it's saved in onnx external_data\n            # format\n            src_dir = str(Path(self.onnx_path).parent)\n            files = os.listdir(src_dir)\n            for fname in files:\n                if \".onnx\" not in fname:\n                    shutil.copy2(\n                        os.path.join(src_dir, fname), os.path.join(path, fname)\n                    )\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        \"\"\"Load the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model is\n                stored.\n            kwargs (Dict): Dictionary of additional arguments for consistency\n                with other Learners.\n\n        Returns:\n            ONNXInferenceLearner: The optimized model.\n        \"\"\"\n        if len(kwargs) > 0:\n            logger.warning(\n                f\"No extra keywords expected for the load method. 
\"\n                f\"Got {kwargs}.\"\n            )\n        path = Path(path)\n        onnx_path = path / ONNX_FILENAMES[\"model_name\"]\n        metadata = LearnerMetadata.read(path)\n        input_tfms = metadata.input_tfms\n        device = Device.from_str(metadata.device)\n        quantization_type = (\n            QuantizationType(metadata.quantization_type)\n            if hasattr(metadata, \"quantization_type\")\n            else None\n        )\n        if input_tfms is not None:\n            input_tfms = MultiStageTransformation.from_dict(\n                metadata.input_tfms\n            )\n        return cls(\n            input_tfms=input_tfms,\n            network_parameters=ModelParams(**metadata.network_parameters),\n            onnx_path=onnx_path,\n            input_names=metadata[\"input_names\"],\n            output_names=metadata[\"output_names\"],\n            device=device,\n            quantization_type=quantization_type,\n        )\n\n    def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):\n        input_dict = {\n            name: input_array\n            for name, input_array in zip(self.input_names, input_arrays)\n        }\n        outputs = self._session.run(self.output_names, input_dict)\n        return outputs\n\n\nclass PytorchONNXInferenceLearner(\n    ONNXInferenceLearner, PytorchBaseInferenceLearner\n):\n    \"\"\"Model run with Microsoft's onnxruntime using a Pytorch interface.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        onnx_path (str or Path): Path to the onnx model.\n        input_names (List[str]): Input names used when the onnx model\n            was produced.\n        output_names (List[str]): Output names used when the onnx model\n            was produced.\n    \"\"\"\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:\n        \"\"\"Predict on the input tensors.\n\n       
 Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:\n            self.set_model_on_gpu()\n        input_arrays = (\n            input_tensor.cpu().detach().numpy()\n            for input_tensor in input_tensors\n        )\n        outputs = self._predict_arrays(input_arrays)\n        return tuple(\n            torch.from_numpy(output).to(self.device.to_torch_format())\n            for output in outputs\n        )\n\n\nclass TensorflowONNXInferenceLearner(\n    ONNXInferenceLearner, TensorflowBaseInferenceLearner\n):\n    \"\"\"Model run with Microsoft's onnxruntime using a tensorflow interface.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        onnx_path (str or Path): Path to the onnx model.\n        input_names (List[str]): Input names used when the onnx model\n            was produced.\n        output_names (List[str]): Output names used when the onnx model\n            was produced.\n    \"\"\"\n\n    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. 
If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:\n            self.set_model_on_gpu()\n        input_arrays = (\n            input_tensor.numpy()\n            if not isinstance(input_tensor, np.ndarray)\n            else input_tensor\n            for input_tensor in input_tensors\n        )\n        outputs = self._predict_arrays(input_arrays)\n        # noinspection PyTypeChecker\n        return tuple(tf.convert_to_tensor(output) for output in outputs)\n\n\nclass NumpyONNXInferenceLearner(\n    ONNXInferenceLearner, NumpyBaseInferenceLearner\n):\n    \"\"\"Model run with Microsoft's onnxruntime using a numpy interface.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        onnx_path (str or Path): Path to the onnx model.\n        input_names (List[str]): Input names used when the onnx model\n            was produced.\n        output_names (List[str]): Output names used when the onnx model\n            was produced.\n    \"\"\"\n\n    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. 
If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[np.ndarray, ...]): Input tensors belonging to\n                the same batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:\n            self.set_model_on_gpu()\n        input_arrays = (input_tensor for input_tensor in input_tensors)\n        outputs = self._predict_arrays(input_arrays)\n        return tuple(outputs)\n\n\nONNX_INFERENCE_LEARNERS: Dict[\n    DeepLearningFramework, Type[ONNXInferenceLearner]\n] = {\n    DeepLearningFramework.PYTORCH: PytorchONNXInferenceLearner,\n    DeepLearningFramework.TENSORFLOW: TensorflowONNXInferenceLearner,\n    DeepLearningFramework.NUMPY: NumpyONNXInferenceLearner,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/openvino.py",
    "content": "import json\nimport shutil\nfrom abc import ABC\nfrom pathlib import Path\nfrom typing import Dict, Union, Type, Generator, Tuple, List, Optional\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.config import OPENVINO_FILENAMES\nfrom nebullvm.core.models import Device, ModelParams, DeepLearningFramework\nfrom nebullvm.operations.inference_learners.base import (\n    BaseInferenceLearner,\n    LearnerMetadata,\n    PytorchBaseInferenceLearner,\n    TensorflowBaseInferenceLearner,\n    NumpyBaseInferenceLearner,\n)\nfrom nebullvm.optional_modules.openvino import (\n    Core,\n    Model,\n    CompiledModel,\n    InferRequest,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass OpenVinoInferenceLearner(BaseInferenceLearner, ABC):\n    \"\"\"Model optimized using OpenVINO.\n\n    The class cannot be directly instantiated, but implements all the core\n    methods needed for using OpenVINO at inference time.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        exec_network (any): The graph executor. 
This is the\n            central component in the OpenVino optimized model execution.\n        input_keys (List): Keys associated to the inputs.\n        output_keys (List): Keys associated to the outputs.\n        description_file (str): File containing a description of the optimized\n            model.\n        weights_file (str): File containing the model weights.\n    \"\"\"\n\n    MODEL_NAME = \"model.bin\"\n    name = \"OpenVINO\"\n\n    def __init__(\n        self,\n        compiled_model: CompiledModel,\n        infer_request: InferRequest,\n        input_keys: List,\n        output_keys: List,\n        description_file: str,\n        weights_file: str,\n        device: Device,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        self.compiled_model = compiled_model\n        self.infer_request = infer_request\n        self.input_keys = input_keys\n        self.output_keys = output_keys\n        self.device = device\n        self.description_file = self._store_file(description_file)\n        self.weights_file = self._store_file(weights_file)\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        \"\"\"Load the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model is\n                stored.\n            kwargs (Dict): Dictionary of additional arguments for the\n                `from_model_name` class method.\n\n        Returns:\n            OpenVinoInferenceLearner: The optimized model.\n        \"\"\"\n        path = Path(path)\n\n        with open(path / OPENVINO_FILENAMES[\"metadata\"], \"r\") as fin:\n            metadata = json.load(fin)\n        metadata.update(kwargs)\n        metadata[\"network_parameters\"] = ModelParams(\n            **metadata[\"network_parameters\"]\n        )\n        input_tfms = metadata.get(\"input_tfms\")\n        if input_tfms is not None:\n            metadata[\"input_tfms\"] = MultiStageTransformation.from_dict(\n                
input_tfms\n            )\n\n        model_name = str(path / OPENVINO_FILENAMES[\"description_file\"])\n        model_weights = str(path / OPENVINO_FILENAMES[\"weights\"])\n        metadata[\"device\"] = Device.from_str(metadata[\"device\"])\n        return cls.from_model_name(\n            model_name=model_name, model_weights=model_weights, **metadata\n        )\n\n    def get_size(self):\n        return len(self.compiled_model.export_model())\n\n    def free_gpu_memory(self):\n        raise NotImplementedError(\"OpenVino does not support GPU inference.\")\n\n    @classmethod\n    def from_model_name(\n        cls,\n        network_parameters: ModelParams,\n        model_name: str,\n        model_weights: str,\n        device: Device,\n        input_tfms: MultiStageTransformation = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Build the optimized model from the network description and its\n        weights.\n\n        Args:\n            network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n            model_name (str): File containing a description of the optimized\n                model.\n            model_weights (str): File containing the model weights.\n            device (Device): Device used to run the model.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction.\n            input_data (DataManager, optional): User defined data.\n        \"\"\"\n        if len(kwargs) > 0:\n            logger.warning(f\"Found extra parameters: {kwargs}\")\n\n        core = Core()\n        model = core.read_model(model=model_name, weights=model_weights)\n\n        dynamic_shape = cls._get_dynamic_shape(model, network_parameters)\n\n        if dynamic_shape is not None:\n            model.reshape(dynamic_shape)\n\n        compiled_model = 
core.compile_model(model=model, device_name=\"CPU\")\n        infer_request = compiled_model.create_infer_request()\n\n        input_keys = list(\n            map(lambda obj: obj.get_any_name(), compiled_model.inputs)\n        )\n        output_keys = list(\n            map(lambda obj: obj.get_any_name(), compiled_model.outputs)\n        )\n\n        return cls(\n            compiled_model,\n            infer_request,\n            input_keys,\n            output_keys,\n            input_tfms=input_tfms,\n            network_parameters=network_parameters,\n            description_file=model_name,\n            weights_file=model_weights,\n            input_data=input_data,\n            device=device,\n        )\n\n    @staticmethod\n    def _get_dynamic_shape(\n        model: Model, network_parameters: ModelParams\n    ) -> Optional[Dict[str, Tuple[int]]]:\n        if network_parameters.dynamic_info is None:\n            return None\n\n        input_names = [\n            list(model_input.names)[0] for model_input in model.inputs\n        ]\n        input_shapes = [\n            input_info.size for input_info in network_parameters.input_infos\n        ]\n        dynamic_shapes = []\n\n        assert len(input_shapes) == len(\n            network_parameters.dynamic_info.inputs\n        ), (\n            f\"Number of inputs defined in dynamic info \"\n            f\"({len(input_shapes)}) is different from the one \"\n            f\"expected from the model \"\n            f\"({len(network_parameters.dynamic_info.inputs)}).\"\n        )\n\n        for input_shape, dynamic_shape_dict in zip(\n            input_shapes, network_parameters.dynamic_info.inputs\n        ):\n            input_shape = list(input_shape)\n            for key in dynamic_shape_dict.keys():\n                input_shape[int(key)] = -1\n            dynamic_shapes.append(tuple(input_shape))\n\n        dynamic_shape_dict = {\n            k: v for k, v in zip(input_names, dynamic_shapes)\n        }\n      
  return dynamic_shape_dict\n\n    def _get_metadata(self, **kwargs) -> LearnerMetadata:\n        # metadata = {\n        #     key: self.__dict__[key] for key in (\"input_keys\", \"output_keys\")\n        # }\n        metadata = {}\n        metadata.update(kwargs)\n        return LearnerMetadata.from_model(self, **metadata)\n\n    def save(self, path: Union[str, Path], **kwargs):\n        \"\"\"Save the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model will\n                be stored.\n            kwargs (Dict): Dictionary of key-value pairs that will be saved in\n                the model metadata file.\n        \"\"\"\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n        metadata = self._get_metadata(**kwargs)\n\n        metadata.save(path)\n\n        shutil.copy(\n            self.description_file,\n            path / OPENVINO_FILENAMES[\"description_file\"],\n        )\n        shutil.copy(self.weights_file, path / OPENVINO_FILENAMES[\"weights\"])\n\n    def _predict_array(\n        self,\n        input_arrays: Generator[np.ndarray, None, None],\n    ) -> Generator[np.ndarray, None, None]:\n\n        results = self.infer_request.infer(\n            inputs={\n                input_key: input_array\n                for input_key, input_array in zip(\n                    self.input_keys, input_arrays\n                )\n            }\n        )\n        results = {\n            output_key.get_any_name(): output_arr\n            for output_key, output_arr in results.items()\n        }\n\n        return (results[output_key] for output_key in self.output_keys)\n\n\nclass PytorchOpenVinoInferenceLearner(\n    OpenVinoInferenceLearner, PytorchBaseInferenceLearner\n):\n    \"\"\"Model optimized using OpenVINO with a Pytorch interface.\n\n    This class can be used exactly in the same way as a pytorch Module object.\n    At prediction time it takes as input pytorch tensors given as positional\n    
arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        exec_network (any): The graph executor. This is the\n            central component in the OpenVino optimized model execution.\n        input_keys (List): Keys associated to the inputs.\n        output_keys (List): Keys associated to the outputs.\n        description_file (str): File containing a description of the optimized\n            model.\n        weights_file (str): File containing the model weights.\n    \"\"\"\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. 
In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        input_arrays = (\n            input_tensor.cpu().detach().numpy()\n            for input_tensor in input_tensors\n        )\n        output_arrays = self._predict_array(input_arrays)\n        return tuple(\n            torch.from_numpy(output_array) for output_array in output_arrays\n        )\n\n\nclass TensorflowOpenVinoInferenceLearner(\n    OpenVinoInferenceLearner, TensorflowBaseInferenceLearner\n):\n    \"\"\"Model optimized using OpenVINO with a tensorflow interface.\n\n    This class can be used exactly in the same way as a tf.Module or\n    keras.Model object.\n    At prediction time it takes as input tensorflow tensors given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        exec_network (any): The graph executor. This is the\n            central component in the OpenVino optimized model execution.\n        input_keys (List): Keys associated to the inputs.\n        output_keys (List): Keys associated to the outputs.\n        description_file (str): File containing a description of the optimized\n            model.\n        weights_file (str): File containing the model weights.\n    \"\"\"\n\n    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. 
Note that the output tensors do\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        input_arrays = (input_tensor.numpy() for input_tensor in input_tensors)\n        output_arrays = self._predict_array(input_arrays)\n        # noinspection PyTypeChecker\n        return tuple(\n            tf.convert_to_tensor(output_array)\n            for output_array in output_arrays\n        )\n\n\nclass NumpyOpenVinoInferenceLearner(\n    OpenVinoInferenceLearner, NumpyBaseInferenceLearner\n):\n    \"\"\"Model optimized using OpenVINO with a numpy interface.\n\n    This class can be used exactly in the same way as a sklearn or\n    numpy-based model.\n    At prediction time it takes as input numpy arrays given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        exec_network (any): The graph executor. This is the\n            central component in the OpenVino optimized model execution.\n        input_keys (List): Keys associated to the inputs.\n        output_keys (List): Keys associated to the outputs.\n        description_file (str): File containing a description of the optimized\n            model.\n        weights_file (str): File containing the model weights.\n    \"\"\"\n\n    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[np.ndarray]): Input tensors belonging to\n                the same batch. 
The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[np.ndarray]: Output tensors. Note that the output tensors\n                does not correspond to the prediction on the input tensors\n                with a 1 to 1 mapping. In fact the output tensors are produced\n                as the multiple-output of the model given a (multi-) tensor\n                input.\n        \"\"\"\n        input_arrays = (input_tensor for input_tensor in input_tensors)\n        output_arrays = self._predict_array(input_arrays)\n        return tuple(output_arrays)\n\n\nOPENVINO_INFERENCE_LEARNERS: Dict[\n    DeepLearningFramework, Type[OpenVinoInferenceLearner]\n] = {\n    DeepLearningFramework.PYTORCH: PytorchOpenVinoInferenceLearner,\n    DeepLearningFramework.TENSORFLOW: TensorflowOpenVinoInferenceLearner,\n    DeepLearningFramework.NUMPY: NumpyOpenVinoInferenceLearner,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/tensor_rt.py",
    "content": "import json\nimport os\nfrom abc import ABC\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Any, Union, Dict, Type, List, Tuple, Generator, Optional\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.config import NVIDIA_FILENAMES\nfrom nebullvm.core.models import (\n    Device,\n    DeviceType,\n    ModelParams,\n    DeepLearningFramework,\n)\nfrom nebullvm.operations.inference_learners.base import (\n    BaseInferenceLearner,\n    LearnerMetadata,\n    PytorchBaseInferenceLearner,\n    TensorflowBaseInferenceLearner,\n    NumpyBaseInferenceLearner,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.tensor_rt import tensorrt as trt, polygraphy\nfrom nebullvm.optional_modules.torch import torch, ScriptModule\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    VerifyContiguity,\n)\n\n\nclass ONNXTensorRTInferenceLearner(BaseInferenceLearner, ABC):\n    \"\"\"Model optimized using TensorRT.\n\n    The class cannot be directly instantiated, but implements all the core\n    methods needed for using TensorRT at inference time.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        engine (any): The tensorRT engine.\n        input_names (List[str]): Names associated to the model input tensors.\n        output_names (List[str]): Names associated to the model output tensors.\n        cuda_stream (any, optional): Stream used for communication with Nvidia\n            GPUs.\n        nvidia_logger (any, optional): Logger used by the Nvidia service\n    \"\"\"\n\n    name = \"TensorRT\"\n\n    def __init__(\n        self,\n        engine: Any,\n        input_names: List[str],\n        output_names: List[str],\n        device: Device,\n        cuda_stream: Any = None,\n        nvidia_logger: 
Any = None,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        self.engine = engine\n        self.context = self.engine.create_execution_context()\n        self.input_names = input_names\n        self.output_names = output_names\n        self.cuda_stream = cuda_stream\n        self.nvidia_logger = nvidia_logger\n        self.output_tensors = None\n        self.device = device\n        self._set_cuda_env(device.type is DeviceType.GPU)\n\n    def _get_metadata(self, **kwargs) -> LearnerMetadata:\n        metadata = {\n            key: self.__dict__[key] for key in (\"input_names\", \"output_names\")\n        }\n        metadata.update(kwargs)\n        return LearnerMetadata.from_model(self, **metadata)\n\n    def _synchronize_stream(self):\n        raise NotImplementedError()\n\n    @property\n    def stream_ptr(self):\n        raise NotImplementedError()\n\n    @staticmethod\n    def _get_default_cuda_stream() -> Any:\n        raise NotImplementedError()\n\n    @staticmethod\n    def check_env(use_gpu):\n        if not use_gpu:\n            raise SystemError(\n                \"You are trying to run an optimizer developed for NVidia gpus \"\n                \"on a machine not connected to any GPU supporting CUDA.\"\n            )\n\n    def _set_cuda_env(self, use_gpu):\n        self.check_env(use_gpu)\n        if self.nvidia_logger is None:\n            self.nvidia_logger = trt.Logger(trt.Logger.WARNING)\n        if self.cuda_stream is None:\n            self.cuda_stream = self._get_default_cuda_stream()\n\n    @classmethod\n    def from_engine_path(\n        cls,\n        network_parameters: ModelParams,\n        engine_path: Union[str, Path],\n        input_names: List[str],\n        output_names: List[str],\n        device: Device,\n        nvidia_logger: Any = None,\n        cuda_stream: Any = None,\n        input_tfms: MultiStageTransformation = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Build 
the model from the serialised engine.\n\n        Args:\n            network_parameters (ModelParams): Model parameters.\n            engine_path (str or Path): Path to the serialised engine. The\n                serialised engine is the serialised version of the engine\n                used for accelerating the inference.\n            input_names (List[str]): Names associated to the model input\n                tensors.\n            output_names (List[str]): Names associated to the model output\n                tensors.\n            device (Device): Device where the model will be run.\n            cuda_stream (any, optional): Stream used for communication with\n                Nvidia GPUs.\n            nvidia_logger (any, optional): Logger used by the Nvidia service.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction.\n            input_data (DataManager, optional): User defined data.\n\n        Returns:\n            ONNXTensorRTInferenceLearner: The optimized model.\n        \"\"\"\n        if kwargs:\n            logger.warning(\n                f\"Debug: Got extra keywords in \"\n                f\"NvidiaInferenceLearner::from_engine_path: {kwargs}\"\n            )\n        if nvidia_logger is None:\n            nvidia_logger = trt.Logger(trt.Logger.WARNING)\n        if input_tfms is None:\n            input_tfms = MultiStageTransformation([])\n        input_tfms.append(VerifyContiguity())\n        runtime = trt.Runtime(nvidia_logger)\n        with open(engine_path, \"rb\") as f:\n            serialized_engine = f.read()\n        engine = runtime.deserialize_cuda_engine(serialized_engine)\n        return cls(\n            input_tfms=input_tfms,\n            network_parameters=network_parameters,\n            engine=engine,\n            input_names=input_names,\n            output_names=output_names,\n            
nvidia_logger=nvidia_logger,\n            cuda_stream=cuda_stream,\n            input_data=input_data,\n            device=device,\n        )\n\n    def _predict_tensors(\n        self,\n        input_ptrs: Generator[Any, None, None],\n        output_ptrs: Generator[Any, None, None],\n        input_shapes: Generator[Any, None, None] = None,\n    ):\n        buffers = [None] * (len(self.input_names) + len(self.output_names))\n        input_idxs = (\n            self.engine[input_name] for input_name in self.input_names\n        )\n        output_idxs = (\n            self.engine[output_name] for output_name in self.output_names\n        )\n        input_shapes = input_shapes or [None] * len(self.input_names)\n        for input_idx, input_ptr, input_shape in zip(\n            input_idxs, input_ptrs, input_shapes\n        ):\n            buffers[input_idx] = input_ptr\n            if input_shape is not None:\n                # If the input shape is empty, we set it to (1,) because\n                # TensorRT doesn't accept empty shapes.\n                if input_shape == torch.Size([]):\n                    input_shape = torch.Size((1,))\n                self.context.set_binding_shape(input_idx, input_shape)\n        for output_idx, output_ptr in zip(output_idxs, output_ptrs):\n            buffers[output_idx] = output_ptr\n        self.context.execute_async_v2(buffers, self.stream_ptr)\n        self._synchronize_stream()\n\n    def get_size(self):\n        return self.engine.serialize().nbytes\n\n    def free_gpu_memory(self):\n        # ONNXtensorrt doesn't need to release gpu memory\n        pass\n\n    def save(self, path: Union[str, Path], **kwargs):\n        \"\"\"Save the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model will\n                be stored.\n            kwargs (Dict): Dictionary of key-value pairs that will be saved in\n                the model metadata file.\n        \"\"\"\n        path = Path(path)\n  
      path.mkdir(exist_ok=True)\n        serialized_engine = self.engine.serialize()\n        with open(path / NVIDIA_FILENAMES[\"engine\"], \"wb\") as fout:\n            fout.write(serialized_engine)\n        metadata = self._get_metadata(**kwargs)\n        with open(path / NVIDIA_FILENAMES[\"metadata\"], \"w\") as fout:\n            json.dump(metadata.to_dict(), fout)\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        \"\"\"Load the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model is\n                stored.\n            kwargs (Dict): Dictionary of additional arguments for the\n                `from_engine_path` class method.\n\n        Returns:\n            ONNXTensorRTInferenceLearner: The optimized model.\n        \"\"\"\n        path = Path(path)\n        with open(path / NVIDIA_FILENAMES[\"metadata\"], \"r\") as fin:\n            metadata = json.load(fin)\n        metadata.update(kwargs)\n        metadata[\"network_parameters\"] = ModelParams(\n            **metadata[\"network_parameters\"]\n        )\n        input_tfms = metadata.get(\"input_tfms\")\n        if input_tfms is not None:\n            metadata[\"input_tfms\"] = MultiStageTransformation.from_dict(\n                input_tfms\n            )\n        metadata[\"device\"] = Device(DeviceType.GPU)\n        return cls.from_engine_path(\n            engine_path=path / NVIDIA_FILENAMES[\"engine\"],\n            **metadata,\n        )\n\n\nclass PytorchTensorRTInferenceLearner(PytorchBaseInferenceLearner):\n    MODEL_NAME = \"model_optimized.pt\"\n    name = \"TensorRT\"\n\n    def __init__(\n        self,\n        torch_model: ScriptModule,\n        device: Device,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        self.model = torch_model.eval()\n        if device.type is DeviceType.GPU:\n            self.model.to(device.to_torch_format())\n            self.use_gpu = True\n        else:\n            
self.use_gpu = False\n        self.device = device\n        self._is_gpu_ready = device.type is DeviceType.GPU\n\n    def get_size(self):\n        with TemporaryDirectory() as tmp_dir:\n            self.save(tmp_dir)\n            return sum(\n                os.path.getsize(Path(tmp_dir) / f)\n                for f in os.listdir(Path(tmp_dir))\n                if os.path.isfile(Path(tmp_dir) / f)\n            )\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:\n        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:\n            self.set_model_on_gpu()\n\n        # PyTorch-TensorRT does not support int64\n        input_tensors = (\n            t.to(self.device.to_torch_format())\n            if t.dtype != torch.int64\n            else t.to(torch.int32).to(self.device.to_torch_format())\n            for t in input_tensors\n        )\n\n        with torch.no_grad():\n            res = self.model(*input_tensors)\n            if not isinstance(res, tuple):\n                res = res.to(self.device.to_torch_format())\n                return (res,)\n            return tuple(out.to(self.device.to_torch_format()) for out in res)\n\n    def save(self, path: Union[str, Path], **kwargs):\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n        metadata = LearnerMetadata.from_model(self, **kwargs)\n        metadata.save(path)\n        torch.jit.save(self.model, path / self.MODEL_NAME)\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        path = Path(path)\n        model = torch.jit.load(path / cls.MODEL_NAME)\n        metadata = LearnerMetadata.read(path)\n        device = Device(DeviceType.GPU)\n        return cls(\n            torch_model=model,\n            network_parameters=ModelParams(**metadata.network_parameters),\n            input_tfms=MultiStageTransformation.from_dict(metadata.input_tfms)\n            if metadata.input_tfms is not None\n            else None,\n            
device=device,\n        )\n\n\nclass PytorchONNXTensorRTInferenceLearner(\n    ONNXTensorRTInferenceLearner, PytorchBaseInferenceLearner\n):\n    \"\"\"Model optimized using TensorRT with a Pytorch interface.\n\n    This class can be used exactly in the same way as a pytorch Module object.\n    At prediction time it takes as input pytorch tensors given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        engine (any): The tensorRT engine.\n        input_names (List[str]): Names associated to the model input tensors.\n        output_names (List[str]): Names associated to the model output tensors.\n        cuda_stream (any, optional): Stream used for communication with Nvidia\n            GPUs.\n        nvidia_logger (any, optional): Logger used by the Nvidia service.\n    \"\"\"\n\n    def _synchronize_stream(self):\n        self.cuda_stream.synchronize()\n\n    @staticmethod\n    def _get_default_cuda_stream() -> Any:\n        return torch.cuda.default_stream()\n\n    @property\n    def stream_ptr(self):\n        return self.cuda_stream.cuda_stream\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. 
In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        input_tensors = [\n            input_tensor.to(self.device.to_torch_format())\n            for input_tensor in input_tensors\n        ]\n        if self.network_parameters.dynamic_info is None:\n            if self.output_tensors is None:\n                self.output_tensors = [\n                    torch.Tensor(*output_size)\n                    .to(self.device.to_torch_format())\n                    .to(output_type.to_torch_format())\n                    for output_size, output_type in zip(\n                        self.network_parameters.output_sizes,\n                        self.network_parameters.output_types,\n                    )\n                ]\n            input_sizes = None\n        else:\n            dynamic_info = self.network_parameters.dynamic_info\n            input_sizes = [\n                input_tensor.size() for input_tensor in input_tensors\n            ]\n            self.output_tensors = [\n                torch.Tensor(\n                    *(\n                        x\n                        if i not in dynamic_axis.keys()\n                        else dynamic_info.retrieve_output_dim(\n                            input_sizes, j, i, x\n                        )\n                        for i, x in enumerate(output_size)\n                    ),\n                )\n                .to(self.device.to_torch_format())\n                .to(output_type.to_torch_format())\n                for j, (output_size, output_type, dynamic_axis) in enumerate(\n                    zip(\n                        self.network_parameters.output_sizes,\n                        self.network_parameters.output_types,\n                        dynamic_info.outputs,\n                    )\n                )\n            ]\n\n        input_ptrs = (\n            input_tensor.data_ptr() for input_tensor in input_tensors\n 
       )\n        output_ptrs = (\n            output_tensor.data_ptr() for output_tensor in self.output_tensors\n        )\n        self._predict_tensors(input_ptrs, output_ptrs, input_sizes)\n        return tuple(\n            output_tensor.to(self.device.to_torch_format())\n            for output_tensor in self.output_tensors\n        )\n\n\nclass BaseArrayONNXTensorRTInferenceLearner(ONNXTensorRTInferenceLearner, ABC):\n    \"\"\"Base Model that can be used for all array-based\n    NvidiaInferenceLearners.\n    \"\"\"\n\n    def _synchronize_stream(self):\n        self.cuda_stream.synchronize()\n\n    @staticmethod\n    def _get_default_cuda_stream() -> Any:\n        return polygraphy.cuda.Stream()\n\n    @property\n    def stream_ptr(self):\n        return self.cuda_stream.ptr\n\n    @staticmethod\n    def _convert_to_array_and_free_memory(cuda_array) -> np.ndarray:\n        array = cuda_array.numpy()\n        cuda_array.free()\n        return array\n\n    def _predict_array(\n        self,\n        cuda_input_arrays: List,\n        input_shapes: Optional[List[Tuple[int, ...]]],\n    ) -> Generator[np.ndarray, None, None]:\n\n        if self.network_parameters.dynamic_info is None:\n            cuda_output_arrays = [\n                polygraphy.cuda.DeviceArray(\n                    shape=output_size,\n                    dtype=output_type.to_numpy_format(),\n                )\n                for output_size, output_type in zip(\n                    self.network_parameters.output_sizes,\n                    self.network_parameters.output_types,\n                )\n            ]\n        else:\n            dynamic_info = self.network_parameters.dynamic_info\n            cuda_output_arrays = [\n                polygraphy.cuda.DeviceArray(\n                    shape=tuple(\n                        x\n                        if i not in dyn_out_axis.keys()\n                        else dynamic_info.retrieve_output_dim(\n                            input_shapes, 
j, i, x\n                        )\n                        for i, x in enumerate(output_size)\n                    ),\n                    dtype=output_type.to_numpy_format(),\n                )\n                for j, (output_size, output_type, dyn_out_axis) in enumerate(\n                    zip(\n                        self.network_parameters.output_sizes,\n                        self.network_parameters.output_types,\n                        dynamic_info.outputs,\n                    )\n                )\n            ]\n        input_ptrs = (cuda_array.ptr for cuda_array in cuda_input_arrays)\n        output_ptrs = (cuda_array.ptr for cuda_array in cuda_output_arrays)\n        self._predict_tensors(input_ptrs, output_ptrs, input_shapes)\n        for cuda_input_array in cuda_input_arrays:\n            cuda_input_array.free()\n        return (\n            self._convert_to_array_and_free_memory(array)\n            for array in cuda_output_arrays\n        )\n\n\nclass TensorflowONNXTensorRTInferenceLearner(\n    BaseArrayONNXTensorRTInferenceLearner, TensorflowBaseInferenceLearner\n):\n    \"\"\"Model optimized using TensorRT with a tensorflow interface.\n\n    This class can be used exactly in the same way as a tf.Module or\n    keras.Model object.\n    At prediction time it takes as input tensorflow tensors given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        engine (any): The tensorRT engine.\n        input_names (List[str]): Names associated to the model input tensors.\n        output_names (List[str]): Names associated to the model output tensors.\n        cuda_stream (any, optional): Stream used for communication with Nvidia\n            GPUs.\n        nvidia_logger (any, optional): Logger used by the Nvidia service.\n    \"\"\"\n\n    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:\n        \"\"\"Predict on 
the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        cuda_input_arrays = [\n            polygraphy.cuda.DeviceArray(\n                shape=tuple(input_tensor.shape),\n                dtype=input_tensor.numpy().dtype,\n            ).copy_from(input_tensor.numpy(), stream=self.cuda_stream)\n            for input_tensor in input_tensors\n        ]\n        input_shapes = (\n            [tuple(input_tensor.shape) for input_tensor in input_tensors]\n            if self.network_parameters.dynamic_info is not None\n            else None\n        )\n        out_arrays = self._predict_array(cuda_input_arrays, input_shapes)\n        return tuple(tf.convert_to_tensor(array) for array in out_arrays)\n\n\nclass NumpyONNXTensorRTInferenceLearner(\n    BaseArrayONNXTensorRTInferenceLearner, NumpyBaseInferenceLearner\n):\n    \"\"\"Model optimized using TensorRT with a tensorflow interface.\n\n    This class can be used exactly in the same way as a tf.Module or\n    keras.Model object.\n    At prediction time it takes as input tensorflow tensors given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        engine (any): The 
tensorRT engine.\n        input_names (List[str]): Names associated to the model input tensors.\n        output_names (List[str]): Names associated to the model output tensors.\n        cuda_stream (any, optional): Stream used for communication with Nvidia\n            GPUs.\n        nvidia_logger (any, optional): Logger used by the Nvidia service.\n    \"\"\"\n\n    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[np.ndarray]): Input tensors belonging to\n                the same batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[np.ndarray]: Output tensors. Note that the output tensors\n                does not correspond to the prediction on the input tensors\n                with a 1 to 1 mapping. 
In fact the output tensors are produced\n                as the multiple-output of the model given a (multi-) tensor\n                input.\n        \"\"\"\n        cuda_input_arrays = [\n            polygraphy.cuda.DeviceArray(\n                shape=tuple(input_tensor.shape), dtype=input_tensor.dtype\n            ).copy_from(input_tensor, stream=self.cuda_stream)\n            for input_tensor in input_tensors\n        ]\n        input_shapes = (\n            [tuple(input_tensor.shape) for input_tensor in input_tensors]\n            if self.network_parameters.dynamic_info is not None\n            else None\n        )\n        return tuple(self._predict_array(cuda_input_arrays, input_shapes))\n\n\nTENSOR_RT_INFERENCE_LEARNERS: Dict[\n    DeepLearningFramework, Type[ONNXTensorRTInferenceLearner]\n] = {\n    DeepLearningFramework.PYTORCH: PytorchONNXTensorRTInferenceLearner,\n    DeepLearningFramework.TENSORFLOW: TensorflowONNXTensorRTInferenceLearner,\n    DeepLearningFramework.NUMPY: NumpyONNXTensorRTInferenceLearner,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/tensorflow.py",
    "content": "import pickle\nfrom pathlib import Path\nfrom typing import Tuple, Union, Dict, Type\n\nfrom nebullvm.config import TENSORFLOW_BACKEND_FILENAMES\nfrom nebullvm.core.models import DeviceType, Device, ModelParams\nfrom nebullvm.operations.inference_learners.base import (\n    TensorflowBaseInferenceLearner,\n    LearnerMetadata,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\n\n\nclass TensorflowBackendInferenceLearner(TensorflowBaseInferenceLearner):\n    name = \"XLA\"\n\n    def __init__(self, tf_model: tf.Module, device: Device, **kwargs):\n        super(TensorflowBackendInferenceLearner, self).__init__(**kwargs)\n        self.model = tf_model\n        self.device = device\n        self._is_gpu_ready = self.device.type is DeviceType.GPU\n\n    def get_size(self):\n        return len(pickle.dumps(self.model, -1))\n\n    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:\n        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:\n            self.set_model_on_gpu()\n        with tf.device(self.device.to_tf_format()):\n            res = self.model(input_tensors)\n        if not isinstance(res, tuple):\n            return (res,)\n        return res\n\n    def save(self, path: Union[str, Path], **kwargs):\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n        metadata = LearnerMetadata.from_model(self, **kwargs)\n        metadata.save(path)\n        self.model.save(path / TENSORFLOW_BACKEND_FILENAMES[\"tf_model\"])\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        path = Path(path)\n        metadata = LearnerMetadata.read(path)\n        network_parameters = ModelParams(**metadata.network_parameters)\n        input_tfms = metadata.input_tfms\n        model = tf.keras.models.load_model(\n            path / TENSORFLOW_BACKEND_FILENAMES[\"tf_model\"]\n        )\n        device = Device.from_str(metadata.device)\n        return cls(\n            
tf_model=model,\n            network_parameters=network_parameters,\n            input_tfms=input_tfms,\n            device=device,\n        )\n\n\nclass TFLiteBackendInferenceLearner(TensorflowBaseInferenceLearner):\n    name = \"TFLite\"\n\n    def __init__(self, tflite_file: bytes, device: Device, **kwargs):\n        super(TFLiteBackendInferenceLearner, self).__init__(**kwargs)\n        self.tflite_file = tflite_file\n        self.interpreter = tf.lite.Interpreter(model_content=tflite_file)\n        self.device = device\n\n    def get_size(self):\n        return len(self.tflite_file)\n\n    def free_gpu_memory(self):\n        raise NotImplementedError(\n            \"TFLite does not support GPU inference on Nvidia devices\"\n        )\n\n    def run(self, *input_tensors: tf.Tensor):\n        input_details = self.interpreter.get_input_details()\n        output_details = self.interpreter.get_output_details()\n        if self.network_parameters.dynamic_info:\n            for i, (input_tensor, detail) in enumerate(\n                zip(input_tensors, input_details)\n            ):\n                if input_tensor.shape != tuple(detail[\"shape\"]):\n                    self.interpreter.resize_tensor_input(i, input_tensor.shape)\n        self.interpreter.allocate_tensors()\n        for i, input_tensor in enumerate(input_tensors):\n            self.interpreter.set_tensor(i, input_tensor)\n        self.interpreter.invoke()\n        return tuple(\n            tf.convert_to_tensor(\n                self.interpreter.get_tensor(output_detail[\"index\"])\n            )\n            for output_detail in output_details\n        )\n\n    def save(self, path: Union[str, Path], **kwargs):\n        path = Path(path)\n        metadata = LearnerMetadata.from_model(self, **kwargs)\n        metadata.save(path)\n        with open(\n            path / TENSORFLOW_BACKEND_FILENAMES[\"tflite_model\"], \"wb\"\n        ) as f:\n            f.write(self.tflite_file)\n\n    @classmethod\n    
def load(cls, path: Union[Path, str], **kwargs):\n        path = Path(path)\n        tflite_file_path = str(\n            path / TENSORFLOW_BACKEND_FILENAMES[\"tflite_model\"]\n        )\n\n        with open(tflite_file_path, \"rb\") as f:\n            tflite_file = f.read()\n\n        metadata = LearnerMetadata.read(path)\n        network_parameters = ModelParams(**metadata.network_parameters)\n        input_tfms = metadata.input_tfms\n        device = Device.from_str(metadata.device)\n        return cls(\n            tflite_file=tflite_file,\n            network_parameters=network_parameters,\n            input_tfms=input_tfms,\n            device=device,\n        )\n\n\nTF_BACKEND_LEARNERS_DICT: Dict[\n    str,\n    Type[\n        Union[TensorflowBackendInferenceLearner, TFLiteBackendInferenceLearner]\n    ],\n] = {\n    \"tf\": TensorflowBackendInferenceLearner,\n    \"tflite\": TFLiteBackendInferenceLearner,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/torch_dynamo.py",
    "content": "from pathlib import Path\nfrom typing import Union\n\nfrom nebullvm.operations.inference_learners.torchscript import (\n    TorchScriptInferenceLearner,\n)\n\n\nclass TorchDynamoInferenceLearner(TorchScriptInferenceLearner):\n    name = \"TorchDynamo\"\n\n    def save(self, path: Union[str, Path], **kwargs):\n        # TODO: Implement save function\n        # Saving it like a normal PyTorch model raises this error:\n        # https://github.com/pytorch/pytorch/issues/93470\n        raise NotImplementedError\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        # TODO: Implement load function\n        raise NotImplementedError\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/torch_neuron.py",
    "content": "import os\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nfrom nebullvm.operations.inference_learners.torchscript import (\n    TorchScriptInferenceLearner,\n)\n\n\nclass TorchNeuronInferenceLearner(TorchScriptInferenceLearner):\n    name = \"TorchNeuron\"\n\n    def get_size(self):\n        with TemporaryDirectory() as tmp_dir:\n            self.save(tmp_dir)\n            return sum(\n                os.path.getsize(Path(tmp_dir) / f)\n                for f in os.listdir(Path(tmp_dir))\n                if os.path.isfile(Path(tmp_dir) / f)\n            )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/torch_xla.py",
    "content": "import os\nimport pickle\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Tuple, Union\n\nfrom nebullvm.core.models import Device, DeviceType, ModelParams\nfrom nebullvm.operations.inference_learners.base import (\n    PytorchBaseInferenceLearner,\n    LearnerMetadata,\n)\nfrom nebullvm.optional_modules.torch import (\n    torch,\n)\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass TorchXLAInferenceLearner(PytorchBaseInferenceLearner):\n    MODEL_NAME = \"model_scripted.pt\"\n    name = \"TorchXLA\"\n\n    def __init__(self, torch_model: torch.nn.Module, device: Device, **kwargs):\n        super().__init__(**kwargs)\n        self.model = torch_model.eval()\n        if device.type is DeviceType.TPU:\n            self.model.to(device.to_torch_format())\n        self.device = device\n        self._is_gpu_ready = self.device.type is DeviceType.TPU\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:\n        if self.device.type is DeviceType.TPU and not self._is_gpu_ready:\n            self.set_model_on_gpu()\n        if self.device.type is DeviceType.TPU:\n            input_tensors = (\n                t.to(self.device.to_torch_format()) for t in input_tensors\n            )\n        with torch.no_grad():\n            res = self.model(*input_tensors)\n            if not isinstance(res, tuple):\n                return (res,)\n            return tuple(out for out in res)\n\n    def get_size(self):\n        try:\n            if hasattr(self.model, \"core_model\"):\n                return len(pickle.dumps(self.model.core_model, -1))\n            else:\n                # Normal torch model\n                return len(pickle.dumps(self.model, -1))\n        except RuntimeError:\n            with TemporaryDirectory() as tmp_dir:\n                self.save(tmp_dir)\n                return sum(\n                    os.path.getsize(Path(tmp_dir) / f)\n                   
 for f in os.listdir(Path(tmp_dir))\n                    if os.path.isfile(Path(tmp_dir) / f)\n                )\n\n    def save(self, path: Union[str, Path], **kwargs):\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n        metadata = LearnerMetadata.from_model(self, **kwargs)\n        metadata.save(path)\n        self.model.cpu()\n        torch.save(self.model, path / self.MODEL_NAME)\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        path = Path(path)\n        model = torch.load(path / cls.MODEL_NAME)\n        metadata = LearnerMetadata.read(path)\n        device = Device.from_str(metadata.device)\n        model.to(device.to_torch_format())\n        return cls(\n            torch_model=model,\n            network_parameters=ModelParams(**metadata.network_parameters),\n            input_tfms=MultiStageTransformation.from_dict(metadata.input_tfms)\n            if metadata.input_tfms is not None\n            else None,\n            device=device,\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/torchscript.py",
    "content": "from pathlib import Path\nfrom typing import Tuple, Union, Optional, List\n\nfrom nebullvm.core.models import Device, DeviceType, ModelParams\nfrom nebullvm.operations.inference_learners.base import (\n    PytorchBaseInferenceLearner,\n    LearnerMetadata,\n)\nfrom nebullvm.optional_modules.torch import (\n    torch,\n    symbolic_trace,\n    Module,\n    ScriptModule,\n    GraphModule,\n)\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass TorchScriptInferenceLearner(PytorchBaseInferenceLearner):\n    MODEL_NAME = \"model_scripted.pt\"\n    name = \"TorchScript\"\n\n    def __init__(self, torch_model: ScriptModule, device: Device, **kwargs):\n        super().__init__(**kwargs)\n        self.model = torch_model.eval()\n        if device.type is DeviceType.GPU:\n            self.model.to(device.to_torch_format())\n        self.device = device\n        self._is_gpu_ready = self.device.type is DeviceType.GPU\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:\n        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:\n            self.set_model_on_gpu()\n        if self.device.type is DeviceType.GPU:\n            input_tensors = (\n                t.to(self.device.to_torch_format()) for t in input_tensors\n            )\n        with torch.no_grad():\n            res = self.model(*input_tensors)\n            if not isinstance(res, tuple):\n                res = res.to(self.device.to_torch_format())\n                return (res,)\n            return tuple(out.to(self.device.to_torch_format()) for out in res)\n\n    def save(self, path: Union[str, Path], **kwargs):\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n        metadata = LearnerMetadata.from_model(self, **kwargs)\n        metadata.save(path)\n\n        torch.jit.save(self.model, path / self.MODEL_NAME)\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        path = Path(path)\n       
 model = torch.jit.load(path / cls.MODEL_NAME)\n        metadata = LearnerMetadata.read(path)\n        device = Device.from_str(metadata.device)\n        return cls(\n            torch_model=model,\n            network_parameters=ModelParams(**metadata.network_parameters),\n            input_tfms=MultiStageTransformation.from_dict(metadata.input_tfms)\n            if metadata.input_tfms is not None\n            else None,\n            device=device,\n        )\n\n    @classmethod\n    def from_torch_model(\n        cls,\n        model: Union[Module, GraphModule],\n        network_parameters: ModelParams,\n        device: Device,\n        input_tfms: Optional[MultiStageTransformation] = None,\n        input_data: List[torch.Tensor] = None,\n    ):\n        if device.type is DeviceType.GPU:\n            input_data = [t.to(device.to_torch_format()) for t in input_data]\n\n        if not isinstance(model, torch.fx.GraphModule):\n            model.eval()\n            try:\n                model_scripted = symbolic_trace(model)\n                model_scripted = torch.jit.script(model_scripted)\n            except Exception:\n                try:\n                    model_scripted = torch.jit.script(model)\n                except Exception:\n                    model_scripted = torch.jit.trace(model, tuple(input_data))\n        else:\n            model_scripted = torch.jit.script(model)\n\n        return cls(\n            torch_model=model_scripted,\n            network_parameters=network_parameters,\n            input_tfms=input_tfms,\n            input_data=input_data,\n            device=device,\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/tvm.py",
    "content": "import os\nimport shutil\nfrom abc import ABC\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Union, Type, Dict, Any, List, Generator, Tuple, Optional\n\nimport numpy as np\n\nfrom nebullvm.config import (\n    TVM_FILENAMES,\n)\nfrom nebullvm.core.models import Device, ModelParams, DeepLearningFramework\nfrom nebullvm.operations.inference_learners.base import (\n    BaseInferenceLearner,\n    LearnerMetadata,\n    PytorchBaseInferenceLearner,\n    TensorflowBaseInferenceLearner,\n    NumpyBaseInferenceLearner,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.optional_modules.tvm import (\n    GraphModule,\n    tvm,\n    ExecutorFactoryModule,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    HalfPrecisionTransformation,\n)\n\n\nclass ApacheTVMInferenceLearner(BaseInferenceLearner, ABC):\n    \"\"\"Model optimized using ApacheTVM.\n\n    The class cannot be directly instantiated, but implements all the core\n    methods needed for using ApacheTVM at inference time.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        graph_executor_module (GraphModule): The graph executor. This is the\n            central component in the ApacheTVM optimized model execution.\n        input_names (List[str]): Names associated to the model input tensors.\n        lib (Module): Component needed for loading the ApacheTVM optimized\n            model.\n        target (str): Target device. It can be wither `llvm` for targeting CPUs\n            or \"cuda\" for targeting GPUs.\n        engine_path (Path, optional): Path to the serialized engine. 
To be used\n            after loading the model (avoiding double engine serialization).\n    \"\"\"\n\n    name = \"ApacheTVM\"\n\n    def __init__(\n        self,\n        graph_executor_module: GraphModule,\n        input_names: List[str],\n        lib: ExecutorFactoryModule,\n        target: str,\n        device: Device,\n        engine_path: Path = None,\n        **kwargs\n    ):\n        super().__init__(**kwargs)\n        self.graph_executor_module = graph_executor_module\n        self.input_names = input_names\n        self.lib = lib\n        self.target = target\n        self.engine_path = (\n            self._store_file(engine_path)\n            if engine_path is not None\n            else engine_path\n        )\n        self.device = device\n\n    def get_size(self):\n        with TemporaryDirectory() as tmp_dir:\n            self.save(tmp_dir)\n            return sum(\n                os.path.getsize(Path(tmp_dir) / f)\n                for f in os.listdir(Path(tmp_dir))\n                if os.path.isfile(Path(tmp_dir) / f)\n            )\n\n    def _has_half_precision_transformation(self):\n        for tfm in self.input_tfms.to_list():\n            if isinstance(tfm, HalfPrecisionTransformation):\n                return True\n        return False\n\n    def _predict_array(\n        self, input_arrays: Generator[np.ndarray, None, None]\n    ) -> Generator[np.ndarray, None, None]:\n        for name, array in zip(self.input_names, input_arrays):\n            self.graph_executor_module.set_input(name, array)\n        self.graph_executor_module.run()\n\n        tvm_outputs = (\n            self.graph_executor_module.get_output(\n                i,\n                tvm.nd.empty(\n                    shape=output_size,\n                    dtype=\"float16\"\n                    if self._has_half_precision_transformation()\n                    else \"float32\",\n                ),\n            ).numpy()\n            for i, output_size in enumerate(\n             
   self.network_parameters.output_sizes\n            )\n        )\n        return tvm_outputs\n\n    def free_gpu_memory(self):\n        # TODO: check if tvm needs to release GPU\n        pass\n\n    def save(self, path: Union[str, Path], **kwargs):\n        \"\"\"Save the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model will\n                be stored.\n            kwargs (Dict): Dictionary of key-value pairs that will be saved in\n                the model metadata file.\n        \"\"\"\n        path = Path(path)\n        path.mkdir(exist_ok=True)\n        metadata = LearnerMetadata.from_model(\n            self, input_names=self.input_names, target=self.target, **kwargs\n        )\n        metadata.save(path)\n        if self.engine_path is None:\n            self.lib.export_library(path / TVM_FILENAMES[\"engine\"])\n        else:\n            shutil.copy(self.engine_path, path)\n\n    @classmethod\n    def load(cls, path: Union[Path, str], **kwargs):\n        \"\"\"Load the model.\n\n        Args:\n            path (Path or str): Path to the directory where the model is\n                stored.\n            kwargs (Dict): Dictionary of additional arguments for the\n                `from_runtime_module` class method.\n\n        Returns:\n            ApacheTVMInferenceLearner: The optimized model.\n        \"\"\"\n        path = Path(path)\n        metadata = LearnerMetadata.read(path).to_dict()\n        network_parameters = ModelParams(**metadata[\"network_parameters\"])\n        lib = tvm.runtime.load_module(path / TVM_FILENAMES[\"engine\"])\n        target_device = metadata[\"target\"]\n        input_names = metadata[\"input_names\"]\n        input_tfms = metadata.get(\"input_tfms\")\n        if input_tfms is not None:\n            metadata[\"input_tfms\"] = MultiStageTransformation.from_dict(\n                input_tfms\n            )\n        device = Device.from_str(metadata[\"device\"])\n        self = 
cls.from_runtime_module(\n            network_parameters=network_parameters,\n            lib=lib,\n            target_device=target_device,\n            input_names=input_names,\n            device=device,\n        )\n        self.engine_path = path / TVM_FILENAMES[\"engine\"]\n        return self\n\n    @classmethod\n    def from_runtime_module(\n        cls,\n        network_parameters: ModelParams,\n        lib: ExecutorFactoryModule,\n        target_device: str,\n        input_names: List[str],\n        device: Device,\n        input_tfms: MultiStageTransformation = None,\n        input_data: DataManager = None,\n    ):\n        \"\"\"Build the model from the runtime module (lib).\n\n        Args:\n            network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n            lib (Module): Component needed for loading the ApacheTVM optimized\n                model.\n            target_device (str): The target device. Either `llvm` (CPU)\n                or `cuda`.\n            input_names (List[str]): Names associated to the model input\n                tensors.\n            device (Device): The device where the model will be executed.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction.\n            input_data (DataManager, optional): User defined data.\n        \"\"\"\n        dev = tvm.device(str(target_device), 0)\n        graph_executor_module = GraphModule(lib[\"default\"](dev))\n        return cls(\n            input_tfms=input_tfms,\n            network_parameters=network_parameters,\n            graph_executor_module=graph_executor_module,\n            input_names=input_names,\n            lib=lib,\n            target=target_device,\n            input_data=input_data,\n            device=device,\n        )\n\n\nclass 
BaseArrayApacheTVMInferenceLearner(ApacheTVMInferenceLearner, ABC):\n    \"\"\"Base Model that can be used for all array-based\n    ApacheTVMInferenceLearners.\n    \"\"\"\n\n    def _inner_predict(\n        self,\n        input_arrays: Generator[np.ndarray, None, None],\n        input_shapes: Optional[List[Tuple[int, ...]]],\n    ) -> Generator[np.ndarray, None, None]:\n        if self.network_parameters.dynamic_info is not None:\n            input_arrays = (\n                np.pad(\n                    input_array,\n                    [\n                        (0, abs(x - y))\n                        for x, y in zip(\n                            input_array.shape,\n                            input_size,\n                        )\n                    ],\n                    mode=\"constant\",\n                    constant_values=0,\n                )\n                for input_array, input_size in zip(\n                    input_arrays, self.network_parameters.input_sizes\n                )\n            )\n\n        output_arrays = self._predict_array(input_arrays)\n        if self.network_parameters.dynamic_info is not None:\n            assert input_shapes is not None\n            dynamic_info = self.network_parameters.dynamic_info\n            return (\n                output_array[\n                    tuple(\n                        slice(\n                            0,\n                            None\n                            if x not in out_dynamic_dict.keys()\n                            else dynamic_info.retrieve_output_dim(\n                                input_shapes, j, i, x\n                            ),\n                        )\n                        for i, x in enumerate(output_array.shape)\n                    )\n                ]\n                for j, (output_array, out_dynamic_dict) in enumerate(\n                    zip(output_arrays, dynamic_info.outputs)\n                )\n            )\n\n        return 
output_arrays\n\n\nclass PytorchApacheTVMInferenceLearner(\n    BaseArrayApacheTVMInferenceLearner, PytorchBaseInferenceLearner\n):\n    \"\"\"Model optimized using ApacheTVM with a Pytorch interface.\n\n    This class can be used exactly in the same way as a pytorch Module object.\n    At prediction time it takes as input pytorch tensors given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        graph_executor_module (GraphModule): The graph executor. This is the\n            central component in the ApacheTVM optimized model execution.\n        input_names (List[str]): Names associated to the model input tensors.\n        lib (Module): Component needed for loading the ApacheTVM optimized\n            model.\n        target (str): Target device. It can be wither `llvm` for targeting CPUs\n            or \"cuda\" for targeting GPUs.\n    \"\"\"\n\n    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. 
In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        input_arrays = (\n            input_tensor.cpu().detach().numpy()\n            for input_tensor in input_tensors\n        )\n        input_shapes = (\n            [tuple(input_tensor.shape) for input_tensor in input_tensors]\n            if self.network_parameters.dynamic_info is not None\n            else None\n        )\n        output_arrays = self._inner_predict(input_arrays, input_shapes)\n        return tuple(\n            torch.from_numpy(array).to(self.device.to_torch_format())\n            for array in output_arrays\n        )\n\n    @staticmethod\n    def _convert_device(device: Any):\n        if isinstance(device, int):\n            return \"cpu\"\n        return device\n\n\nclass TensorflowApacheTVMInferenceLearner(\n    BaseArrayApacheTVMInferenceLearner, TensorflowBaseInferenceLearner\n):\n    \"\"\"Model optimized using ApacheTVM with a tensorflow interface.\n\n    This class can be used exactly in the same way as a tf.Module or\n    keras.Model object.\n    At prediction time it takes as input tensorflow tensors given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        graph_executor_module (GraphModule): The graph executor. This is the\n            central component in the ApacheTVM optimized model execution.\n        input_names (List[str]): Names associated to the model input tensors.\n        lib (Module): Component needed for loading the ApacheTVM optimized\n            model.\n        target (str): Target device. 
It can be wither `llvm` for targeting CPUs\n            or \"cuda\" for targeting GPUs.\n    \"\"\"\n\n    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[Tensor]): Input tensors belonging to the same\n                batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[Tensor]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        input_arrays = (input_tensor.numpy() for input_tensor in input_tensors)\n        input_shapes = (\n            [tuple(input_tensor.shape) for input_tensor in input_tensors]\n            if self.network_parameters.dynamic_info is not None\n            else None\n        )\n        return tuple(\n            tf.convert_to_tensor(out)\n            for out in self._inner_predict(input_arrays, input_shapes)\n        )\n\n\nclass NumpyApacheTVMInferenceLearner(\n    BaseArrayApacheTVMInferenceLearner, NumpyBaseInferenceLearner\n):\n    \"\"\"Model optimized using ApacheTVM with a tensorflow interface.\n\n    This class can be used exactly in the same way as a tf.Module or\n    keras.Model object.\n    At prediction time it takes as input tensorflow tensors given as positional\n    arguments.\n\n    Attributes:\n        network_parameters (ModelParams): The model parameters as batch\n                size, input and output sizes.\n        graph_executor_module (GraphModule): The graph 
executor. This is the\n            central component in the ApacheTVM optimized model execution.\n        input_names (List[str]): Names associated to the model input tensors.\n        lib (Module): Component needed for loading the ApacheTVM optimized\n            model.\n        target (str): Target device. It can be wither `llvm` for targeting CPUs\n            or \"cuda\" for targeting GPUs.\n    \"\"\"\n\n    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:\n        \"\"\"Predict on the input tensors.\n\n        Note that the input tensors must be on the same batch. If a sequence\n        of tensors is given when the model is expecting a single input tensor\n        (with batch size >= 1) an error is raised.\n\n        Args:\n            input_tensors (Tuple[ndarray]): Input tensors belonging to the\n                same batch. The tensors are expected having dimensions\n                (batch_size, dim1, dim2, ...).\n\n        Returns:\n            Tuple[ndarray]: Output tensors. Note that the output tensors does\n                not correspond to the prediction on the input tensors with a\n                1 to 1 mapping. In fact the output tensors are produced as the\n                multiple-output of the model given a (multi-) tensor input.\n        \"\"\"\n        input_arrays = (input_tensor for input_tensor in input_tensors)\n        input_shapes = (\n            [tuple(input_tensor.shape) for input_tensor in input_tensors]\n            if self.network_parameters.dynamic_info is not None\n            else None\n        )\n        return tuple(self._inner_predict(input_arrays, input_shapes))\n\n\nAPACHE_TVM_INFERENCE_LEARNERS: Dict[\n    DeepLearningFramework, Type[ApacheTVMInferenceLearner]\n] = {\n    DeepLearningFramework.PYTORCH: PytorchApacheTVMInferenceLearner,\n    DeepLearningFramework.TENSORFLOW: TensorflowApacheTVMInferenceLearner,\n    DeepLearningFramework.NUMPY: NumpyApacheTVMInferenceLearner,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/inference_learners/utils.py",
    "content": "from pathlib import Path\nfrom typing import Union, Any\n\nfrom nebullvm.operations.inference_learners.base import LearnerMetadata\nfrom nebullvm.optional_modules.diffusers import StableDiffusionPipeline\nfrom nebullvm.tools.diffusers import postprocess_diffusers\n\n\ndef load_model(path: Union[Path, str], pipe: StableDiffusionPipeline = None):\n    \"\"\"Load the optimized model previously saved in the given path.\n\n    Args:\n        path (Union[Path, str]): Path to the directory where the model is\n            saved.\n        pipe (StableDiffusionPipeline): Diffusion pipeline to be used for\n            loading the model. This parameter is only needed if the model\n            to be loaded is a diffusion model. Default: None.\n\n    Returns:\n        InferenceLearner: Model optimized by Speedster.\n    \"\"\"\n    optimized_model = LearnerMetadata.read(path).load_model(path)\n    if pipe is not None:\n        optimized_model = postprocess_diffusers(\n            optimized_model, pipe, optimized_model.device\n        )\n    return optimized_model\n\n\ndef save_model(model: Any, path: Union[Path, str]):\n    \"\"\"Save the optimized model in the given path.\n\n    Args:\n        model (Any): Model to be saved.\n        path (Union[Path, str]): Path to the directory where to\n            save the model.\n\n    Returns:\n        InferenceLearner: Model optimized by Speedster.\n    \"\"\"\n    if isinstance(model, StableDiffusionPipeline):\n        model.unet.model.save(path)\n    else:\n        model.save(path)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/measures/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/measures/base.py",
    "content": "import abc\n\nfrom nebullvm.operations.base import Operation\n\n\nclass Measure(Operation, abc.ABC):\n    def __init__(self):\n        super().__init__()\n        self.measure_result = None\n\n    @abc.abstractmethod\n    def execute(self, **kwargs):\n        raise NotImplementedError()\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/measures/measures.py",
    "content": "from typing import List, Tuple, Any, Callable, Dict\n\nimport numpy as np\n\nfrom nebullvm.config import QUANTIZATION_DATA_NUM\nfrom nebullvm.core.models import (\n    BenchmarkOriginalModelResult,\n    DeepLearningFramework,\n)\nfrom nebullvm.operations.inference_learners.base import BaseInferenceLearner\nfrom nebullvm.operations.measures.base import Measure\nfrom nebullvm.operations.measures.utils import (\n    compute_torch_latency,\n    compute_tf_latency,\n    compute_onnx_latency,\n    compute_relative_difference,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.onnx import run_onnx_model\nfrom nebullvm.tools.pytorch import run_torch_model\nfrom nebullvm.tools.tf import run_tf_model\n\nCOMPUTE_OUTPUT_FRAMEWORK: Dict[DeepLearningFramework, Callable] = {\n    DeepLearningFramework.PYTORCH: run_torch_model,\n    DeepLearningFramework.TENSORFLOW: run_tf_model,\n    DeepLearningFramework.NUMPY: run_onnx_model,\n}\n\nCOMPUTE_LATENCY_FRAMEWORK: Dict[DeepLearningFramework, Callable] = {\n    DeepLearningFramework.PYTORCH: compute_torch_latency,\n    DeepLearningFramework.TENSORFLOW: compute_tf_latency,\n    DeepLearningFramework.NUMPY: compute_onnx_latency,\n}\n\n\nclass MetricDropMeasure(Measure):\n    def __init__(self):\n        super().__init__()\n        self.valid = None\n\n    def execute(\n        self,\n        optimized_learner: BaseInferenceLearner,\n        input_data: List[Tuple[Any, ...]],\n        base_outputs_list: List[Tuple[Any, ...]],\n        perf_loss_ths: float,\n        metric_func: Callable = None,\n        ys: List = None,\n        aggregation_func: Callable = np.mean,\n    ):\n        metric_func = metric_func or compute_relative_difference\n        relative_differences = []\n        if ys is None:\n            ys = [None] * len(input_data)\n\n        assert len(input_data) == len(base_outputs_list) == len(ys), (\n            \"INTERNAL ASSERT FAILED: error during computation of precision \"\n            
\"of the optimized model, got wrong dimensions of the data. \"\n        )\n\n        for inputs, base_outputs, y in zip(input_data, base_outputs_list, ys):\n            opt_outputs = optimized_learner(*inputs)\n            relative_difference = max(\n                metric_func(base_output, opt_output, y)\n                for base_output, opt_output in zip(base_outputs, opt_outputs)\n            )\n            relative_differences.append(relative_difference)\n        relative_difference = aggregation_func(relative_differences)\n        self.valid = relative_difference <= perf_loss_ths\n        self.measure_result = relative_difference\n\n    def get_result(self) -> Tuple[bool, float]:\n        return self.valid, self.measure_result\n\n\nclass LatencyOriginalModelMeasure(Measure):\n    def __init__(self):\n        super().__init__()\n        self.outputs = None\n\n    def execute(\n        self,\n        model: Any,\n        input_data: DataManager,\n        dl_framework: DeepLearningFramework,\n    ) -> BenchmarkOriginalModelResult:\n        self.logger.info(\"Benchmark performance of original model\")\n\n        self.outputs = [\n            tuple(\n                COMPUTE_OUTPUT_FRAMEWORK[dl_framework](\n                    model, tuple(input_tensors[0]), self.device\n                )\n            )\n            for input_tensors in input_data\n        ]\n\n        inputs = input_data.get_list(QUANTIZATION_DATA_NUM)\n        self.measure_result, _ = COMPUTE_LATENCY_FRAMEWORK[dl_framework](\n            inputs, model, self.device\n        )\n        self.logger.info(\n            f\"Original model latency: {self.measure_result} sec/iter\"\n        )\n\n        return BenchmarkOriginalModelResult(\n            latency_seconds=self.measure_result,\n            model_outputs=self.outputs,\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/measures/utils.py",
    "content": "import time\nfrom typing import Tuple, List, Union, Any\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.config import ONNX_PROVIDERS\nfrom nebullvm.core.models import Device, DeviceType\nfrom nebullvm.operations.inference_learners.base import BaseInferenceLearner\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch, Module\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.onnx import (\n    convert_to_numpy,\n    get_input_names,\n    get_output_names,\n)\n\n\ndef compute_torch_latency(\n    xs: List[Tuple[torch.Tensor]],\n    model: Module,\n    device: Device,\n    steps: int = 100,\n    warmup_steps: int = 10,\n) -> Tuple[float, List[float]]:\n    \"\"\"Compute the latency associated with the torch model.\n\n    Args:\n        xs (List[Tuple[torch.Tensor]]): List of tuples containing the\n            input tensors (a single batch for the model).\n        model (Module): Torch model.\n        device (Device): Device where computing the latency.\n        steps (int, optional): Number of input data to be used to compute the\n            latency of the model. It must be a number <= len(xs). Default: 100.\n        warmup_steps (int, optional): Number of input data to be used to warm\n            up the model. It must be a number <= len(xs). 
Default: 10.\n\n    Returns:\n        Float: Average latency.\n        List[Float]: List of latencies obtained.\n    \"\"\"\n    if device.type is not DeviceType.TPU:\n        xs = [\n            tuple(t.to(device.to_torch_format()) for t in tensors)\n            for tensors in xs\n        ]\n        model = model.to(device.to_torch_format())\n    model.eval()\n    latencies = []\n    with torch.no_grad():\n        for i in range(warmup_steps):\n            _ = model.forward(*xs[i])\n        for i in range(steps):\n            starting_time = time.time()\n            _ = model.forward(*xs[i])\n            latencies.append(time.time() - starting_time)\n        latency = np.mean(latencies)\n    return latency, latencies\n\n\ndef compute_tf_latency(\n    xs: List[Tuple[tf.Tensor]],\n    model: Union[tf.Module, tf.keras.Model],\n    device: Device,\n    steps: int = 100,\n    warmup_steps: int = 10,\n) -> Tuple[float, List[float]]:\n    \"\"\"Compute the latency associated with the tensorflow model.\n\n    Args:\n        xs (List[Tuple[tf.Tensor]]): List of tuples containing the\n            input tensors (a single batch for the model).\n        model (Module or keras.Model): TF model.\n        device (Device): Device where computing the latency.\n        steps (int, optional): Number of input data to be used to compute the\n            latency of the model. It must be a number <= len(xs). Default: 100.\n        warmup_steps (int, optional): Number of input data to be used to warm\n            up the model. It must be a number <= len(xs). 
Default: 10.\n\n    Returns:\n        Float: Average latency.\n        List[Float]: List of latencies obtained.\n    \"\"\"\n    latencies = []\n    with tf.device(device.to_tf_format()):\n        for i in range(warmup_steps):\n            _ = model(xs[i])\n        for i in range(steps):\n            starting_time = time.time()\n            _ = model(xs[i])\n            latencies.append(time.time() - starting_time)\n        latency = np.mean(latencies)\n        return latency, latencies\n\n\ndef compute_onnx_latency(\n    xs: List[Tuple[np.array]],\n    model: str,\n    device: Device,\n    steps: int = 100,\n    warmup_steps: int = 10,\n) -> Tuple[float, List[float]]:\n    \"\"\"Compute the latency associated with the ONNX model.\n\n    Args:\n        xs (List[Tuple[np.array]]): List of tuples containing the\n            inputs (a single batch for the model).\n        model (str): ONNX model path.\n        device (Device): Device where computing the latency.\n        steps (int, optional): Number of input data to be used to compute the\n            latency of the model. It must be a number <= len(xs). Default: 100.\n        warmup_steps (int, optional): Number of input data to be used to warm\n            up the model. It must be a number <= len(xs). 
Default: 10.\n\n    Returns:\n        Float: Average latency.\n        List[Float]: List of latencies obtained.\n    \"\"\"\n    from nebullvm.optional_modules.onnxruntime import onnxruntime as ort\n\n    input_names = get_input_names(model)\n    output_names = get_output_names(model)\n\n    if device.type is DeviceType.GPU and len(ONNX_PROVIDERS[\"cuda\"]) == 3:\n        ONNX_PROVIDERS[\"cuda\"][1] = (\n            \"CUDAExecutionProvider\",\n            {\n                \"device_id\": device.idx,\n            },\n        )\n\n    model = ort.InferenceSession(\n        model,\n        providers=ONNX_PROVIDERS[\"cuda\"][1:]\n        if device.type is DeviceType.GPU\n        else ONNX_PROVIDERS[\"cpu\"],\n    )\n\n    latencies = []\n    for i in range(warmup_steps):\n        inputs = {name: array for name, array in zip(input_names, xs[i])}\n        _ = model.run(output_names=output_names, input_feed=inputs)\n    for i in range(steps):\n        inputs = {name: array for name, array in zip(input_names, xs[i])}\n        starting_time = time.time()\n        _ = model.run(output_names=output_names, input_feed=inputs)\n        latencies.append(time.time() - starting_time)\n    latency = np.mean(latencies)\n    return latency, latencies\n\n\ndef compute_optimized_running_time(\n    optimized_model: BaseInferenceLearner,\n    input_data: DataManager,\n    steps: int = 100,\n    min_steps: int = 5,\n    warmup_steps: int = 10,\n) -> float:\n    \"\"\"Compute the running time of the optimized model.\n\n    Args:\n        optimized_model (BaseInferenceLearner): Optimized model.\n        input_data: (DataManager): Dataset used to compute latency.\n        steps (int, optional): Number of input data to be used to\n            compute the latency of the model. Default: 100.\n        min_steps (int, optional): Minimum number of iterations to\n            be performed. 
Default: 5.\n        warmup_steps (int, optional): Number of input data to be used\n            to warm up the model. Default: 10.\n\n    Returns:\n        Float: Average latency.\n    \"\"\"\n\n    latencies = []\n    last_median = None\n\n    # Warmup\n    inputs_list = input_data.get_split(\"test\").get_list(warmup_steps)\n    for model_inputs in inputs_list:\n        _ = optimized_model(*model_inputs)\n\n    # Compute latency\n    inputs_list = input_data.get_split(\"test\").get_list(steps)\n    for model_inputs in inputs_list:\n        starting_time = time.time()\n        _ = optimized_model(*model_inputs)\n        latencies.append(time.time() - starting_time)\n        if len(latencies) > min_steps:\n            median = np.median(latencies)\n            diff = (\n                np.abs(median - last_median) / last_median\n                if last_median is not None\n                else 1.0\n            )\n            if diff < 0.05:\n                return median\n            last_median = median\n    return np.median(latencies)\n\n\ndef compute_relative_difference(\n    tensor_1: Any,\n    tensor_2: Any,\n    y: Any = None,\n    eps: float = 1e-5,\n) -> float:\n    if y is not None:\n        logger.debug(\n            \"Received a label for the precision computation. \"\n            \"It will be ignored.\"\n        )\n\n    tensor_1, tensor_2 = map(convert_to_numpy, (tensor_1, tensor_2))\n\n    assert tensor_1.shape == tensor_2.shape, (\n        \"The outputs of the original and optimized models have \"\n        \"different shapes\"\n    )\n\n    diff = np.abs(tensor_1 - tensor_2) / (\n        np.maximum(np.abs(tensor_1), np.abs(tensor_2)) + eps\n    )\n    return float(np.mean(diff))\n\n\ndef compute_accuracy_drop(tensor_1: Any, tensor_2: Any, y: Any) -> float:\n    assert y is not None, (\n        \"No label found in the dataloader provided. 
\"\n        \"To use accuracy metric, you must set also the labels\"\n    )\n    tensor_1, tensor_2, y = map(convert_to_numpy, (tensor_1, tensor_2, y))\n    accuracy_1 = np.mean(tensor_1.argmax(axis=-1) == y)\n    accuracy_2 = np.mean(tensor_2.argmax(axis=-1) == y)\n    return accuracy_1 - accuracy_2\n\n\nQUANTIZATION_METRIC_MAP = {\n    \"accuracy\": compute_accuracy_drop,\n    \"numeric_precision\": compute_relative_difference,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/base.py",
    "content": "import abc\nfrom typing import Any, Dict, List, Optional\n\nfrom nebullvm.core.models import QuantizationType\nfrom nebullvm.operations.base import Operation\n\n\nclass Compiler(Operation, abc.ABC):\n    supported_ops: Dict[str, List[Optional[QuantizationType]]]\n\n    def __init__(self):\n        super().__init__()\n        self.compiled_model = None\n\n    @abc.abstractmethod\n    def execute(self, **kwargs):\n        raise NotImplementedError()\n\n    @abc.abstractmethod\n    def _compile_model(self, **kwargs) -> Any:\n        raise NotImplementedError()\n\n    @abc.abstractmethod\n    def _quantize_model(self, **kwargs) -> Any:\n        raise NotImplementedError()\n\n    def get_result(self) -> Any:\n        return self.compiled_model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/deepsparse.py",
    "content": "from pathlib import Path\nfrom typing import Union\n\nfrom nebullvm.core.models import (\n    ModelParams,\n    QuantizationType,\n)\nfrom nebullvm.operations.conversions.converters import (\n    PytorchConverter,\n)\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\nfrom nebullvm.optional_modules.torch import (\n    Module,\n    GraphModule,\n)\nfrom nebullvm.tools.data import DataManager\n\n\nclass DeepSparseCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [None],\n        \"gpu\": [],\n    }\n\n    def __init__(self):\n        super().__init__()\n        self.conversion_op = PytorchConverter()\n\n    def execute(\n        self,\n        model: Module,\n        onnx_output_path: str,\n        model_params: ModelParams,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Compile the input model using DeepSparse Compiler.\n\n        Args:\n            model (torch.nn.Module): The pytorch model.\n            onnx_output_path (str): Path where the converted ONNX model will be\n                stored.\n            model_params (ModelParams): The model parameters.\n            quantization_type (QuantizationType): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. 
Default: None\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        self.compiled_model = self._compile_model(\n            model, onnx_output_path, input_data, model_params\n        )\n\n    def _compile_model(\n        self,\n        model: Union[Module, GraphModule],\n        onnx_output_path: str,\n        input_data: DataManager,\n        model_params: ModelParams,\n    ) -> str:\n        self.conversion_op.model_name = \"model_pruned\"\n        onnx_pruned_path = Path(onnx_output_path)\n        self.conversion_op.to(self.device).set_state(\n            model, input_data\n        ).execute(onnx_pruned_path, model_params)\n        onnx_pruned_path = str(onnx_pruned_path / \"model_pruned.onnx\")\n        return onnx_pruned_path\n\n    @staticmethod\n    def _quantize_model(**kwargs):\n        raise NotImplementedError()\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/__init__.py",
    "content": "from copy import deepcopy\nfrom typing import Union\n\nfrom nebullvm.core.models import QuantizationType, DeviceType\nfrom nebullvm.operations.optimizations.compilers.faster_transformer.bert import (  # noqa: E501\n    detect_and_swap_bert_model,\n)\nfrom nebullvm.operations.optimizations.compilers.torchscript import (\n    TorchScriptCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    get_faster_transformer_repo_path,\n)\nfrom nebullvm.optional_modules.torch import (\n    GraphModule,\n    Module,\n    ScriptModule,\n    torch,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.huggingface import PyTorchTransformerWrapper\n\ndefault_lib_path = str(\n    get_faster_transformer_repo_path()\n    / \"build\"\n    / \"lib\"\n    / \"libth_transformer.so\"\n)\n\n\ndef detect_and_swap_model(model, data_type=\"fp16\", remove_padding=False):\n    \"\"\"currently only supports:\n    - BertModel and model with BertModel as .bert attribute\n    \"\"\"\n    model = detect_and_swap_bert_model(\n        model,\n        data_type=data_type,\n        lib_path=default_lib_path,\n        remove_padding=remove_padding,\n    )\n    if data_type == \"fp16\":\n        model.half()\n    elif data_type == \"bf16\":\n        model.bfloat16()\n    return model\n\n\nclass FasterTransformerCompiler(TorchScriptCompiler):\n    supported_ops = {\n        \"cpu\": [None, QuantizationType.STATIC, QuantizationType.DYNAMIC],\n        \"gpu\": [\n            None,\n            QuantizationType.HALF,\n        ],\n    }\n\n    @torch.no_grad()\n    def _compile_model(\n        self,\n        model: Union[Module, GraphModule],\n        input_data: DataManager,\n        quantization_type: QuantizationType,\n    ) -> ScriptModule:\n        model = deepcopy(model)  # Some operations modify the model in-place\n        if isinstance(model, PyTorchTransformerWrapper):\n            # .core_model is a huggingface model\n            data_type = (\n 
               \"fp16\"\n                if quantization_type is QuantizationType.HALF\n                else \"fp32\"\n            )\n            model.core_model = detect_and_swap_model(\n                model.core_model, data_type=data_type, remove_padding=False\n            )\n            if self.device.type is DeviceType.GPU:\n                model.cuda()\n\n        return super()._compile_model(model, input_data, quantization_type)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/__init__.py",
    "content": "import os\n\nfrom nebullvm.operations.optimizations.compilers.faster_transformer.bert.modeling_bert import (  # noqa: E501\n    BertModel as FasterBertModel,\n)\nfrom nebullvm.operations.optimizations.compilers.faster_transformer.bert.modeling_bert import (  # noqa: E501\n    CustomEncoder,\n    EncoderWeights,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    get_faster_transformer_repo_path,\n)\nfrom nebullvm.optional_modules.huggingface import BertModel as HFBertModel\nfrom nebullvm.optional_modules.torch import torch\n\n\ndefault_lib_path = str(\n    get_faster_transformer_repo_path()\n    / \"build\"\n    / \"lib\"\n    / \"libth_transformer.so\"\n)\n\n\ndef swap_bert_encoder(model, data_type, lib_path, remove_padding=False):\n    \"\"\"\n    Replace the encoder of the model with a custom encoder\n    that uses the Faster Transformer library.\n    \"\"\"\n    weights = EncoderWeights(\n        model.config.num_hidden_layers,\n        model.config.hidden_size,\n        model.state_dict(),\n    )\n    weights.to_cuda()\n    if data_type == \"fp16\":\n        weights.to_half()\n    elif data_type == \"bf16\":\n        weights.to_bfloat16()\n    lib_path = os.path.abspath(lib_path)\n    enc = CustomEncoder(\n        model.config.num_hidden_layers,\n        model.config.num_attention_heads,\n        model.config.hidden_size // model.config.num_attention_heads,\n        weights,\n        remove_padding=remove_padding,\n        path=lib_path,\n    )\n    enc_ = torch.jit.script(enc)\n    model.replace_encoder(enc_)\n\n\ndef swap_model(\n    model: HFBertModel, data_type, lib_path, remove_padding=False\n) -> FasterBertModel:\n    # bert model need some custom code to call the custom encoder\n    # so we need to use custom bert class\n    new_model = FasterBertModel(model.config)\n    new_model.load_state_dict(model.state_dict())\n    swap_bert_encoder(new_model, data_type, lib_path, remove_padding)\n    return new_model\n\n\ndef 
detect_and_swap_bert_model(\n    model, data_type, lib_path=default_lib_path, remove_padding=False\n):\n    if type(model) == HFBertModel:\n        model = swap_model(model, data_type, lib_path, remove_padding)\n    if hasattr(model, \"bert\") and type(model.bert) == HFBertModel:\n        model.bert = swap_model(\n            model.bert, data_type, lib_path, remove_padding\n        )\n    return model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/checkpoint_quantization.py",
    "content": "# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/checkpoint_quantization.py # noqa: E501\n# Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.optional_modules.torch import torch\n\nACTIVATION_AMAX_NUM = 72\nINT8O_GEMM_NUM = 8\nTRT_FUSED_MHA_AMAX_NUM = 3\nSCALE_RESERVE_NUM = 21\n\n\ndef checkpoint_quantization(\n    init_dict, sparse, ths_path=\"./lib/libth_transformer.so\"\n):\n    logger.info(\"Quantizing checkpoint ...\")\n    torch.classes.load_library(ths_path)\n    weight_quantize = torch.ops.fastertransformer.weight_quantize\n\n    def init_graph():\n        layer_num = 0\n        regex = re.compile(\"layer.\\d+\")  # noqa: W605\n        amaxTotalNum = 0\n        for name, tensor_value in init_dict.items():\n            if \"intermediate.dense.weight\" in name and amaxTotalNum == 0:\n                amaxTotalNum = (\n                    ACTIVATION_AMAX_NUM\n                    + 9 * tensor_value.size(1)\n                    + INT8O_GEMM_NUM\n                    + TRT_FUSED_MHA_AMAX_NUM\n                    + SCALE_RESERVE_NUM\n                )\n            tmp = regex.findall(name)\n            if len(tmp) < 1:\n                continue\n            num_tmp = int(tmp[0].replace(\"layer.\", \"\"))\n            if 
layer_num < num_tmp:\n                layer_num = num_tmp\n        layer_num = layer_num + 1\n        # add new var for amax\n        for i in range(layer_num):\n            init_dict[\n                \"bert.encoder.layer.{}.amaxList\".format(i)\n            ] = torch.zeros((amaxTotalNum,), dtype=torch.float32)\n        return layer_num, amaxTotalNum\n\n    layer_num, amaxTotalNum = init_graph()\n\n    kernel_name_list = [\n        \"attention.self.query\",\n        \"attention.self.key\",\n        \"attention.self.value\",\n        \"attention.output.dense\",\n        \"intermediate.dense\",\n        \"output.dense\",\n    ]\n\n    amax_name_list = [\n        \"attention.self.query._input_quantizer\",\n        \"attention.self.query._aftergemm_quantizer\",\n        \"attention.self.matmul_q_input_quantizer\",\n        \"attention.self.key._aftergemm_quantizer\",\n        \"attention.self.matmul_k_input_quantizer\",\n        \"attention.self.value._aftergemm_quantizer\",\n        \"attention.self.matmul_v_input_quantizer\",\n        \"attention.self.softmax_input_quantizer\",\n        \"attention.self.matmul_a_input_quantizer\",\n        \"attention.output.dense._input_quantizer\",\n        \"attention.output.dense._aftergemm_quantizer\",\n        \"intermediate.dense._input_quantizer\",\n        \"intermediate.dense._aftergemm_quantizer\",\n        \"output.dense._input_quantizer\",\n        \"output.dense._aftergemm_quantizer\",\n        \"special_F2Bias_scale\",\n    ]\n\n    int8O_gemm_weight_amax_list = [0 for i in range(INT8O_GEMM_NUM)]\n    int8O_gemm_weight_list = [\n        \"attention.self.query\",\n        \"attention.self.key\",\n        \"attention.self.value\",\n        \"attention.self.matmul_k_input_quantizer\",\n        \"attention.self.matmul_v_input_quantizer\",\n        \"attention.output.dense\",\n        \"intermediate.dense\",\n        \"output.dense\",\n    ]\n\n    int8O_gemm_input_amax_list = [0 for i in range(INT8O_GEMM_NUM)]\n    
int8O_gemm_input_list = [\n        \"attention.self.query._input_quantizer\",\n        \"attention.self.key._input_quantizer\",\n        \"attention.self.value._input_quantizer\",\n        \"attention.self.matmul_q_input_quantizer\",\n        \"attention.self.matmul_a_input_quantizer\",\n        \"attention.output.dense._input_quantizer\",\n        \"intermediate.dense._input_quantizer\",\n        \"output.dense._input_quantizer\",\n    ]\n\n    int8O_gemm_output_amax_list = [0 for i in range(INT8O_GEMM_NUM)]\n    int8O_gemm_output_list = [\n        \"attention.self.query._aftergemm_quantizer\",\n        \"attention.self.key._aftergemm_quantizer\",\n        \"attention.self.value._aftergemm_quantizer\",\n        \"attention.self.softmax_input_quantizer\",\n        \"attention.output.dense._input_quantizer\",\n        \"attention.output.dense._aftergemm_quantizer\",\n        \"intermediate.dense._aftergemm_quantizer\",\n        \"output.dense._aftergemm_quantizer\",\n    ]\n\n    same_value_tuple_list = [\n        (\n            \"attention.self.query._input_quantizer\",\n            \"attention.self.key._input_quantizer\",\n            \"attention.self.value._input_quantizer\",\n            \"attention.output.add_residual_input_quantizer\",\n        ),\n        (\n            \"intermediate.dense._input_quantizer\",\n            \"output.add_residual_input_quantizer\",\n        ),\n    ]\n\n    factor = 1000000.0  # noqa: F841\n    for i in range(layer_num):\n        amaxList = np.zeros([amaxTotalNum]).astype(np.float32)\n        amax_id = 0\n        # verify some quantizers have same value.\n        # input_quantizer is per-tensor quantization\n        for same_value_tuple in same_value_tuple_list:\n            tmp_v = init_dict[\n                \"bert.encoder.layer.{}.{}._amax\".format(i, same_value_tuple[0])\n            ].numpy()\n            for same_value_name in same_value_tuple:\n                tmp_v_2 = init_dict[\n                    
\"bert.encoder.layer.{}.{}._amax\".format(i, same_value_name)\n                ].numpy()\n                assert np.allclose(tmp_v, tmp_v_2)\n\n        for amax_name in amax_name_list:\n            if amax_name == \"special_F2Bias_scale\":\n                if i != layer_num - 1:\n                    quant_max = init_dict[\n                        \"bert.encoder.layer.{}.{}._amax\".format(\n                            i + 1, amax_name_list[0]\n                        )\n                    ].item()\n                    amax = abs(quant_max)\n                else:\n                    # not used, placeholder\n                    amax = 1.0\n                amaxList[amax_id] = amax\n                amax_id += 1\n                amaxList[amax_id] = amax / 127.0\n                amax_id += 1\n                amaxList[amax_id] = amax / 127.0 / 127.0\n                amax_id += 1\n                amaxList[amax_id] = 127.0 / amax\n                amax_id += 1\n                continue\n\n            quant_max = init_dict[\n                \"bert.encoder.layer.{}.{}._amax\".format(i, amax_name)\n            ].item()\n            amax = abs(quant_max)  # round(abs(quant_max)*factor)/factor\n            if amax_name in int8O_gemm_input_list:\n                int8O_gemm_input_amax_list[\n                    int8O_gemm_input_list.index(amax_name)\n                ] = amax\n                if amax_name == \"attention.self.query._input_quantizer\":\n                    int8O_gemm_input_amax_list[\n                        int8O_gemm_input_list.index(\n                            \"attention.self.key._input_quantizer\"\n                        )\n                    ] = amax\n                    int8O_gemm_input_amax_list[\n                        int8O_gemm_input_list.index(\n                            \"attention.self.value._input_quantizer\"\n                        )\n                    ] = amax\n            if amax_name in int8O_gemm_output_list:\n                
int8O_gemm_output_amax_list[\n                    int8O_gemm_output_list.index(amax_name)\n                ] = amax\n            if amax_name in int8O_gemm_weight_list:\n                int8O_gemm_weight_amax_list[\n                    int8O_gemm_weight_list.index(amax_name)\n                ] = amax\n            amaxList[amax_id] = amax\n            amax_id += 1\n            amaxList[amax_id] = amax / 127.0\n            amax_id += 1\n            amaxList[amax_id] = amax / 127.0 / 127.0\n            amax_id += 1\n            amaxList[amax_id] = 127.0 / amax\n            amax_id += 1\n\n        # kernel amax starts from ACTIVATION_AMAX_NUM\n        assert amax_id == 64\n        amax_id = ACTIVATION_AMAX_NUM\n        for kernel_id, kernel_name in enumerate(kernel_name_list):\n            kernel = (\n                init_dict[\n                    \"bert.encoder.layer.{}.{}.weight\".format(i, kernel_name)\n                ]\n                .transpose(-1, -2)\n                .contiguous()\n            )\n            quant_max2 = init_dict[\n                \"bert.encoder.layer.{}.{}._weight_quantizer._amax\".format(\n                    i, kernel_name\n                )\n            ]\n            amax2 = abs(quant_max2)\n            if amax2.dim() == 0:\n                quant_max_processed = torch.full(\n                    (kernel.size(1),),\n                    amax2.item(),\n                    dtype=amax2.dtype,\n                    device=amax2.device,\n                )\n            else:\n                quant_max_processed = amax2.view(-1)\n            kernel_processed = weight_quantize(\n                kernel, quant_max_processed.cuda(), sparse\n            )\n            init_dict[\n                \"bert.encoder.layer.{}.{}.weight\".format(i, kernel_name)\n            ] = kernel_processed\n            if kernel_name in int8O_gemm_weight_list:\n                int8O_gemm_weight_amax_list[\n                    int8O_gemm_weight_list.index(kernel_name)\n    
            ] = quant_max_processed[0]\n            for e in quant_max_processed:\n                amaxList[amax_id] = e\n                amax_id += 1\n\n        # for int8O gemm deQuant\n        for j in range(INT8O_GEMM_NUM):\n            amaxList[amax_id] = (\n                int8O_gemm_input_amax_list[j] * int8O_gemm_weight_amax_list[j]\n            ) / (127.0 * int8O_gemm_output_amax_list[j])\n            amax_id += 1\n\n        # for trt fused MHA amax\n        # QKV_addBias_amax\n        amaxList[amax_id] = np.maximum(\n            np.maximum(amaxList[8], amaxList[16]), amaxList[24]\n        )\n        amax_id += 1\n        # softmax amax\n        amaxList[amax_id] = amaxList[32]\n        amax_id += 1\n        # bmm2 amax\n        amaxList[amax_id] = amaxList[36]\n        amax_id += 1\n\n        init_dict[\"bert.encoder.layer.{}.amaxList\".format(i)] = torch.tensor(\n            amaxList, dtype=torch.float32\n        )\n    logger.info(\"Quantizing checkpoint done.\")\n    return init_dict\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/modeling_bert.py",
    "content": "# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/modeling_bert.py  # noqa: E501\n# This file is mostly copied from the FasterTransformer repo\n# https://github.com/NVIDIA/FasterTransformer\n# Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List, Optional\n\nfrom loguru import logger\n\nfrom nebullvm.optional_modules.torch import torch, torch_distributed as dist\n\nfrom nebullvm.optional_modules.huggingface import (\n    BertConfig,\n    BertEmbeddings,\n    BertEncoder,\n    BertPooler,\n    BertPreTrainedModel,\n)\n\nfrom .checkpoint_quantization import checkpoint_quantization\n\n\nclass EncoderWeights(object):\n    def __init__(\n        self,\n        layer_num,\n        hidden_dim,\n        weights=None,\n        sparse=False,\n        tensor_para_size=1,\n        pipeline_para_size=1,\n    ):\n        \"\"\"weights need be a state_dict of bert model\"\"\"\n        self.layer_num = layer_num\n        self.int8 = False\n        self.hidden_dim = hidden_dim\n        self.weights = {}\n        self.tensor_para_size = tensor_para_size\n        self.pipeline_para_size = pipeline_para_size\n\n        self.use_mpi = dist.is_mpi_available()\n\n        if self.use_mpi:\n            try:\n                dist.init_process_group(backend=\"mpi\")\n            except:  # noqa: E722\n     
           logger.info(\n                    \"[INFO] WARNING: Exception occurred in \"\n                    \"dist.init_process_group(backend='mpi').\"\n                    \"Maybe the process group has been initialized somewhere else.\"  # noqa: E501\n                )\n        else:\n            logger.info(\"[INFO] MPI is not available in this PyTorch build.\")\n            assert (\n                tensor_para_size == 1\n            ), \"[FATAL] MPI is required for tensor_para_size > 1.\"\n            assert (\n                pipeline_para_size == 1\n            ), \"[FATAL] MPI is required for pipeline_para_size > 1.\"\n\n        self.rank = dist.get_rank() if self.use_mpi else 0\n        self.device_count = torch.cuda.device_count()\n        self.device = self.rank % self.device_count\n        torch.cuda.set_device(self.device)\n\n        world_size = dist.get_world_size() if self.use_mpi else 1  # noqa: F841\n        self.tensor_para_rank = self.rank % self.tensor_para_size\n        self.pipeline_para_rank = self.rank // self.tensor_para_size\n        if weights is None:\n            self._generated_weights = True\n            for i in range(layer_num):\n                pre = \"encoder.layer.\" + str(i) + \".\"\n                self.weights[\n                    pre + \"attention.self.query.weight\"\n                ] = torch.zeros(hidden_dim, hidden_dim)\n                self.weights[pre + \"attention.self.query.bias\"] = torch.zeros(\n                    hidden_dim\n                )\n                self.weights[pre + \"attention.self.key.weight\"] = torch.zeros(\n                    hidden_dim, hidden_dim\n                )\n                self.weights[pre + \"attention.self.key.bias\"] = torch.zeros(\n                    hidden_dim\n                )\n                self.weights[\n                    pre + \"attention.self.value.weight\"\n                ] = torch.zeros(hidden_dim, hidden_dim)\n                self.weights[pre + 
\"attention.self.value.bias\"] = torch.zeros(\n                    hidden_dim\n                )\n                self.weights[\n                    pre + \"attention.output.dense.weight\"\n                ] = torch.zeros(hidden_dim, hidden_dim)\n                self.weights[\n                    pre + \"attention.output.dense.bias\"\n                ] = torch.zeros(hidden_dim)\n                self.weights[\n                    pre + \"attention.output.LayerNorm.weight\"\n                ] = torch.zeros(hidden_dim)\n                self.weights[\n                    pre + \"attention.output.LayerNorm.bias\"\n                ] = torch.zeros(hidden_dim)\n                self.weights[pre + \"intermediate.dense.weight\"] = torch.zeros(\n                    4 * hidden_dim, hidden_dim\n                )  # noqa: E501\n                self.weights[pre + \"intermediate.dense.bias\"] = torch.zeros(\n                    4 * hidden_dim\n                )\n                self.weights[pre + \"output.dense.weight\"] = torch.zeros(\n                    hidden_dim, 4 * hidden_dim\n                )\n                self.weights[pre + \"output.dense.bias\"] = torch.zeros(\n                    hidden_dim\n                )\n                self.weights[pre + \"output.LayerNorm.weight\"] = torch.zeros(\n                    hidden_dim\n                )\n                self.weights[pre + \"output.LayerNorm.bias\"] = torch.zeros(\n                    hidden_dim\n                )\n            for k, v in self.weights.items():\n                if not k.endswith(\"_amax\"):\n                    self.weights[k] = torch.nn.init.uniform_(v, -1, 1)\n            if sparse:\n                for k, v in self.weights.items():\n                    if (\n                        \"query.weight\" in k\n                        or \"key.weight\" in k\n                        or \"value.weight\" in k\n                        or \"dense.weight\" in k\n                    ):\n                        
v_shape = v.shape\n                        v = v.view(-1, 4)\n                        _, indices = torch.topk(\n                            torch.abs(v), 2, dim=-1, largest=False\n                        )\n                        v.scatter_(1, indices, 0)\n                        self.weights[k] = v.view(v_shape)\n        else:\n            self._generated_weights = False\n            for k, v in weights.items():\n                ks = k.split(\".\")\n                if ks[-2] == \"LayerNorm\":\n                    if ks[-1] == \"gamma\":\n                        ks[-1] = \"weight\"\n                    elif ks[-1] == \"beta\":\n                        ks[-1] = \"bias\"\n                self.weights[\".\".join(ks)] = v\n\n    def listed_weights(self):\n        ret = []\n        start_layer = (\n            self.pipeline_para_rank * self.layer_num // self.pipeline_para_size\n        )\n        end_layer = (\n            (self.pipeline_para_rank + 1)\n            * self.layer_num\n            // self.pipeline_para_size\n        )\n        if not self.int8:\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.query.weight\"\n                        ].transpose(-1, -2)\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 0\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            
\"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.query.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.key.weight\"\n                        ].transpose(-1, -2)\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 2\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.key.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                  
  self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.value.weight\"\n                        ].transpose(-1, -2)\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 4\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.value.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.dense.weight\"\n                        ].transpose(-1, -2)\n                        for layer_idx in 
range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 6\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[1] // self.tensor_para_size, dim=1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.dense.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.LayerNorm.weight\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.LayerNorm.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        
self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"intermediate.dense.weight\"\n                        ].transpose(-1, -2)\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 10\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"intermediate.dense.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret[-1] = (\n                ret[-1]\n                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.dense.weight\"\n                        ].transpose(-1, -2)\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 12\n            ret[-1] = (\n                ret[-1]\n                
.split(ret[-1].shape[1] // self.tensor_para_size, dim=1)[\n                    self.tensor_para_rank\n                ]\n                .contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.dense.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.LayerNorm.weight\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.LayerNorm.bias\"\n                        ]\n                        for layer_idx in range(start_layer, end_layer)\n                    ],\n                    0,\n                ).contiguous()\n            )\n        else:\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.query.weight\"\n            
            ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 0\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.query.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.key.weight\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 2\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.key.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.value.weight\"\n                      
  ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 4\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.self.value.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.dense.weight\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 6\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.dense.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.LayerNorm.weight\"\n                  
      ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"attention.output.LayerNorm.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"intermediate.dense.weight\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 10\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"intermediate.dense.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.dense.weight\"\n                        ]\n       
                 for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )  # 12\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.dense.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.LayerNorm.weight\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"output.LayerNorm.bias\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"amaxList\"\n                        ]\n                        for layer_idx in 
range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n            ret.append(\n                torch.stack(\n                    [\n                        self.weights[\n                            \"encoder.layer.\"\n                            + str(layer_idx)\n                            + \".\"\n                            + \"h_amaxList\"\n                        ]\n                        for layer_idx in range(self.layer_num)\n                    ],\n                    0,\n                ).contiguous()\n            )\n        return ret\n\n    def to_cuda(self):\n        if not self.int8:\n            for k, v in self.weights.items():\n                self.weights[k] = v.cuda()\n        else:\n            h_scale_list = {}\n            for k, v in self.weights.items():\n                if \"amaxList\" in k:\n                    k_h = k.replace(\"amaxList\", \"h_amaxList\")\n                    h_scale_list[k_h] = v\n                self.weights[k] = v.cuda()\n            for k, v in h_scale_list.items():\n                self.weights[k] = v\n\n    def to_half(self):\n        if self.int8:\n            raise RuntimeError(\n                \"Cannot cast to half if the weights have been casted to int8.\"\n            )\n        for k, v in self.weights.items():\n            self.weights[k] = v.half()\n\n    def to_bfloat16(self):\n        if self.int8:\n            raise RuntimeError(\n                \"Cannot cast to bfloat16 if the weights have been casted to int8.\"  # noqa: E501\n            )\n        for k, v in self.weights.items():\n            self.weights[k] = v.bfloat16()\n\n    def to_int8(self, sparse=False, ths_path=\"./lib/libth_transformer.so\"):\n        if self._generated_weights:\n            amax_tensor_1 = torch.Tensor(self.hidden_dim).fill_(127.0)\n            amax_tensor_2 = torch.Tensor(self.hidden_dim * 4).fill_(127.0)\n            for i in range(self.layer_num):\n       
         pre = \"encoder.layer.\" + str(i) + \".\"\n                self.weights[\n                    pre + \"attention.self.query._input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.query._weight_quantizer._amax\"\n                ] = amax_tensor_1\n                self.weights[\n                    pre + \"attention.self.query._aftergemm_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.key._input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.key._weight_quantizer._amax\"\n                ] = amax_tensor_1\n                self.weights[\n                    pre + \"attention.self.key._aftergemm_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.value._input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.value._weight_quantizer._amax\"\n                ] = amax_tensor_1\n                self.weights[\n                    pre + \"attention.self.value._aftergemm_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.matmul_q_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.matmul_k_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.matmul_v_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.self.matmul_a_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n       
             pre + \"attention.self.softmax_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.output.dense._input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.output.dense._weight_quantizer._amax\"\n                ] = amax_tensor_1\n                self.weights[\n                    pre + \"attention.output.dense._aftergemm_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.output.add_local_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"attention.output.add_residual_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"intermediate.dense._input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"intermediate.dense._weight_quantizer._amax\"\n                ] = amax_tensor_2\n                self.weights[\n                    pre + \"intermediate.dense._aftergemm_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"output.dense._input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"output.dense._weight_quantizer._amax\"\n                ] = amax_tensor_1\n                self.weights[\n                    pre + \"output.dense._aftergemm_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"output.add_local_input_quantizer._amax\"\n                ] = torch.tensor(127.0)\n                self.weights[\n                    pre + \"output.add_residual_input_quantizer._amax\"\n                ] = 
torch.tensor(127.0)\n        if (\n            \"encoder.layer.0.attention.self.query._input_quantizer._amax\"\n            not in self.weights\n        ):\n            raise RuntimeError(\n                \"There is no quantization node in the checkpoint, cannot be quantized to int8.\"  # noqa: E501\n            )\n        if self.int8:\n            return\n        self.int8 = True\n        for k, v in self.weights.items():\n            if k.endswith(\"bias\") or k.endswith(\"LayerNorm.weight\"):\n                self.weights[k] = v.half()\n            elif k.endswith(\"weight\"):\n                self.weights[k] = v.float().cuda()\n            else:\n                self.weights[k] = v.float().cpu()\n        self.weights = checkpoint_quantization(\n            self.weights, sparse, ths_path, verbose=False\n        )\n\n\nclass CustomEncoder(torch.nn.Module):\n    def __init__(\n        self,\n        layer_num,\n        head_num,\n        head_size,\n        weights,\n        int8_mode=0,\n        remove_padding=False,\n        sparse=False,\n        path=\"./lib/libth_transformer.so\",\n        tensor_para_size=1,\n        pipeline_para_size=1,\n    ):\n        super().__init__()\n        self.layer_num = layer_num\n        self.remove_padding = remove_padding\n        self.int8_mode = int8_mode\n        logger.info(f\"loading faster transformer library from {path}\")\n        torch.classes.load_library(path)\n\n        weights_ = weights.listed_weights()\n\n        self.use_mpi = dist.is_mpi_available()\n\n        if self.use_mpi:\n            try:\n                dist.init_process_group(backend=\"mpi\")\n            except:  # noqa: E722\n                logger.info(\n                    \"[INFO] WARNING: Exception occurred in\"\n                    \"dist.init_process_group(backend='mpi').\"\n                    \"Maybe the process group has been initialized somewhere else.\"  # noqa: E501\n                )\n        else:\n            logger.info(\"[INFO] 
MPI is not available in this PyTorch build.\")\n            assert (\n                tensor_para_size == 1\n            ), \"[FATAL] MPI is required for tensor_para_size > 1.\"\n            assert (\n                pipeline_para_size == 1\n            ), \"[FATAL] MPI is required for pipeline_para_size > 1.\"\n\n        if int8_mode == 0:\n            assert len(weights_) == 16\n            try:\n                self.encoders = torch.classes.FasterTransformer.Bert(\n                    *weights_,\n                    head_num,\n                    head_size,\n                    4 * head_num * head_size,\n                    remove_padding,\n                    layer_num,\n                    sparse,\n                    1.0,\n                    tensor_para_size,\n                    pipeline_para_size,\n                )\n            except:  # noqa: E722\n                # legacy ths for 20.03 image\n                self.encoders = torch.classes.FasterTransformerBert(\n                    *weights_,\n                    head_num,\n                    head_size,\n                    4 * head_num * head_size,\n                    remove_padding,\n                    layer_num,\n                    sparse,\n                    1.0,\n                    tensor_para_size,\n                    pipeline_para_size,\n                )\n        else:\n            assert len(weights_) == 18\n            assert (\n                tensor_para_size == 1\n            ), \"INT8 BERT still only support tensor_para_size = 1\"\n            assert (\n                pipeline_para_size == 1\n            ), \"INT8 BERT still only support pipeline_para_size = 1\"\n            try:\n                self.encoders = torch.classes.FasterTransformer.INT8Bert(\n                    *weights_,\n                    head_num,\n                    head_size,\n                    remove_padding,\n                    layer_num,\n                    int8_mode,\n                    sparse,\n       
             1.0,\n                )\n            except:  # noqa: E722\n                # legacy ths for 20.03 image\n                self.encoders = torch.classes.FasterTransformerINT8Bert(\n                    *weights_,\n                    head_num,\n                    head_size,\n                    remove_padding,\n                    layer_num,\n                    int8_mode,\n                    sparse,\n                    1.0,\n                )\n\n    def forward(self, hidden_states, attention_mask, sequence_lengths):\n        hidden_states = self.encoders.forward(hidden_states, sequence_lengths)\n        return (hidden_states,)\n\n\nclass HuggingFaceEncoder(torch.nn.Module):\n    def __init__(self, layer_num, head_num, head_size, weights=None):\n        super().__init__()\n        hidden_dim = head_num * head_size\n        # TODO(bhsueh) The implementation of hidden_act='gelu' is differen\n        #  to FT's (and google BERT) implementation\n        # FT's implementation is equivalent to hidden_act='gelu_new',\n        # but there are some issues for int8 sparse under gelu_new\n        conf = BertConfig(\n            hidden_size=hidden_dim,\n            intermediate_size=4 * hidden_dim,\n            num_attention_heads=head_num,\n            num_hidden_layers=layer_num,\n            hidden_act=\"gelu\",\n        )\n        self.encoder = BertEncoder(conf)\n        w = {}\n        for k, v in weights.weights.items():\n            if k.startswith(\"encoder\") and not k.endswith(\"_amax\"):\n                w[k[13:]] = weights.weights[k]\n        self.encoder.load_state_dict(w)\n        self.head_mask = [None] * layer_num\n\n    def forward(self, hidden_states, attention_mask):\n        extended_attention_mask = (1.0 - attention_mask) * -10000.0\n        output = self.encoder(\n            hidden_states,\n            extended_attention_mask,\n            self.head_mask,\n            return_dict=False,\n        )\n        return output\n\n\n# Based on: 
https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/modeling_bert.py # noqa: E501\n# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # noqa: E501\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"PyTorch BERT model modified from HuggingFace transformers. \"\"\"\n\n\nclass BertModel(BertPreTrainedModel):\n    def __init__(self, config):\n        super().__init__(config)\n        self.config = config\n\n        self.embeddings = BertEmbeddings(config)\n        self.encoder = BertEncoder(config)\n        self.pooler = BertPooler(config)\n\n        self.init_weights()\n        self.use_ext_encoder = False\n\n    def forward(\n        self,\n        input_ids: Optional[torch.Tensor] = None,\n        attention_mask: Optional[torch.Tensor] = None,\n        token_type_ids: Optional[torch.Tensor] = None,\n        position_ids: Optional[torch.Tensor] = None,\n        head_mask: Optional[torch.Tensor] = None,\n        inputs_embeds: Optional[torch.Tensor] = None,\n        encoder_hidden_states: Optional[torch.Tensor] = None,\n        encoder_attention_mask: Optional[torch.Tensor] = None,\n        past_key_values: Optional[List[torch.FloatTensor]] = None,\n        use_cache: Optional[bool] = None,\n        output_attentions: Optional[bool] = None,\n        output_hidden_states: Optional[bool] = None,\n     
   return_dict: Optional[bool] = None,\n    ):\n        if input_ids is not None and inputs_embeds is not None:\n            raise ValueError(\n                \"You cannot specify both input_ids and inputs_embeds at the same time\"  # noqa: E501\n            )\n        elif input_ids is not None:\n            input_shape = input_ids.size()\n        elif inputs_embeds is not None:\n            input_shape = inputs_embeds.size()[:-1]\n        else:\n            raise ValueError(\n                \"You have to specify either input_ids or inputs_embeds\"\n            )\n\n        device = (\n            input_ids.device if input_ids is not None else inputs_embeds.device\n        )\n\n        if attention_mask is None:\n            attention_mask = torch.ones(input_shape, device=device)\n        if token_type_ids is None:\n            token_type_ids = torch.zeros(\n                input_shape, dtype=torch.long, device=device\n            )\n\n        if self.use_ext_encoder:\n            # if attention_mask.dim() == 3:\n            #     extended_attention_mask = attention_mask\n            # elif attention_mask.dim() == 2:\n            #     extended_attention_mask = attention_mask[:, None, :].repeat(1, input_shape[1], 1) # noqa: E501\n            # else:\n            #     raise ValueError(\n            #         \"Wrong shape for input_ids (shape {}) or attention_mask (shape {})\".format(# noqa: E501\n            #             input_shape, attention_mask.shape\n            #         )\n            #     )\n            assert attention_mask.dim() == 2\n            extended_attention_mask = attention_mask.view(\n                -1, 1, 1, attention_mask.size(-1)\n            )\n            m_2 = extended_attention_mask.transpose(-1, -2)\n            extended_attention_mask = extended_attention_mask * m_2\n            extended_attention_mask = extended_attention_mask.to(\n                dtype=next(self.parameters()).dtype\n            )  # fp16 compatibility\n          
  seq_lens = torch.sum(attention_mask, 1, dtype=torch.int32).cuda()\n        else:\n            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # noqa: E501\n            # ourselves in which case we just need to make it broadcastable to all heads. # noqa: E501\n            if attention_mask.dim() == 3:\n                extended_attention_mask = attention_mask[:, None, :, :]\n            elif attention_mask.dim() == 2:\n                extended_attention_mask = attention_mask[:, None, None, :]\n            else:\n                raise ValueError(\n                    \"Wrong shape for input_ids (shape {}) or attention_mask (shape {})\".format(  # noqa: E501\n                        input_shape, attention_mask.shape\n                    )\n                )\n            # Since attention_mask is 1.0 for positions we want to attend\n            # and 0.0 for masked positions, this operation will create a\n            # tensor which is 0.0 for positions we want to attend\n            # and -10000.0 for masked positions.\n            # Since we are adding it to the raw scores before the softmax,\n            # this is effectively the same as removing these entirely.\n            extended_attention_mask = extended_attention_mask.to(\n                dtype=next(self.parameters()).dtype\n            )  # fp16 compatibility\n            extended_attention_mask = (\n                1.0 - extended_attention_mask\n            ) * -10000.0\n\n        embedding_output = self.embeddings(\n            input_ids=input_ids,\n            position_ids=position_ids,\n            token_type_ids=token_type_ids,\n            inputs_embeds=inputs_embeds,\n        )\n        if self.use_ext_encoder:\n            encoder_outputs = self.encoder(\n                embedding_output, extended_attention_mask, seq_lens\n            )\n        else:\n            head_mask = [None] * self.config.num_hidden_layers\n            encoder_outputs = 
self.encoder(\n                embedding_output,\n                attention_mask=extended_attention_mask,\n                head_mask=head_mask,\n            )\n\n        sequence_output = encoder_outputs[0]\n        pooled_output = self.pooler(sequence_output)\n\n        outputs = (sequence_output, pooled_output,) + encoder_outputs[\n            1:\n        ]  # add hidden_states and attentions if they are here\n        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions) # noqa: E501\n\n    def replace_encoder(self, new_encoder):\n        self.encoder = new_encoder\n        self.use_ext_encoder = True\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/__init__.py",
    "content": "# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/gpt_summarization.py # noqa: E501\n# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport tempfile\nfrom typing import Callable, Iterable, List, Optional, Tuple, Union\n\nfrom nebullvm.operations.optimizations.compilers.faster_transformer.gpt.utils import \\\n    gpt_decoder\nfrom nebullvm.operations.optimizations.compilers.faster_transformer.gpt.utils.huggingface_gpt_convert import (  # noqa: E501\n    main as convert_huggingface_gpt_to_faster_transformer,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    get_faster_transformer_repo_path,\n)\nfrom nebullvm.optional_modules.huggingface import GPT2LMHeadModel\nfrom nebullvm.optional_modules.torch import torch\n\nlib_path = default_lib_path = str(\n    get_faster_transformer_repo_path()\n    / \"build\"\n    / \"lib\"\n    / \"libth_transformer.so\"\n)\n\n\nclass FasterTransformerGPT2Wrapper(torch.nn.Module):\n    def __init__(self, model: gpt_decoder.Gpt, config):\n        super().__init__()\n        self.model = model\n        self.config = config\n        self.device = model.device\n\n    @torch.no_grad()\n    def generate(\n        self,\n        inputs: Optional[torch.Tensor] = None,\n        max_length: Optional[int] = None,\n        min_length: Optional[int] = 
None,\n        do_sample: Optional[bool] = None,\n        early_stopping: Optional[bool] = None,\n        num_beams: Optional[int] = 1,\n        temperature: Optional[float] = None,\n        penalty_alpha: Optional[float] = None,\n        top_k: Optional[int] = None,\n        top_p: Optional[float] = None,\n        typical_p: Optional[float] = None,\n        repetition_penalty: Optional[float] = None,\n        bad_words_ids: Optional[Iterable[int]] = None,\n        force_words_ids: Optional[\n            Union[Iterable[int], Iterable[Iterable[int]]]\n        ] = None,\n        bos_token_id: Optional[int] = None,\n        pad_token_id: Optional[int] = None,\n        eos_token_id: Optional[int] = None,\n        length_penalty: Optional[float] = None,\n        no_repeat_ngram_size: Optional[int] = None,\n        encoder_no_repeat_ngram_size: Optional[int] = None,\n        num_return_sequences: Optional[int] = None,\n        max_time: Optional[float] = None,\n        max_new_tokens: Optional[int] = None,\n        decoder_start_token_id: Optional[int] = None,\n        use_cache: Optional[bool] = None,\n        num_beam_groups: Optional[int] = None,\n        diversity_penalty: Optional[float] = None,\n        prefix_allowed_tokens_fn: Optional[\n            Callable[[int, torch.Tensor], List[int]]\n        ] = None,\n        # logits_processor: Optional[LogitsProcessorList] = None,\n        # renormalize_logits: Optional[bool] = None,\n        # stopping_criteria: Optional[StoppingCriteriaList] = None,\n        # constraints: Optional[List[Constraint]] = None,\n        output_attentions: Optional[bool] = None,\n        output_hidden_states: Optional[bool] = None,\n        output_scores: Optional[bool] = None,\n        return_dict_in_generate: Optional[bool] = None,\n        forced_bos_token_id: Optional[int] = None,\n        forced_eos_token_id: Optional[int] = None,\n        remove_invalid_values: Optional[bool] = None,\n        synced_gpus: Optional[bool] = False,\n    
    exponential_decay_length_penalty: Optional[Tuple[int, float]] = None,\n        suppress_tokens: Optional[List[int]] = None,\n        begin_suppress_tokens: Optional[List[int]] = None,\n        forced_decoder_ids: Optional[List[List[int]]] = None,\n    ):\n\n        input_lengths = torch.tensor(\n            [len(input) for input in inputs],\n            dtype=torch.int32,\n            device=self.model.device,\n        )\n        batch_size = len(inputs)\n\n        def convert_to_tensor_if_not(value, dtype=torch.float32):\n            if value is None:\n                return value\n            if isinstance(value, torch.Tensor):\n                return value\n            return value * torch.ones(batch_size, dtype=dtype)  # cpu tensor\n\n        top_k = convert_to_tensor_if_not(top_k, dtype=torch.int32)\n        top_p = convert_to_tensor_if_not(top_p, dtype=torch.float32)\n        temperature = convert_to_tensor_if_not(\n            temperature, dtype=torch.float32\n        )\n        repetition_penalty = convert_to_tensor_if_not(\n            repetition_penalty, dtype=torch.float32\n        )\n        min_length = convert_to_tensor_if_not(min_length, dtype=torch.int32)\n        len_penalty = convert_to_tensor_if_not(\n            length_penalty, dtype=torch.float32\n        )\n        if max_length is None:\n            # gen_length is required for faster transformer\n            # infer it from the model config\n            max_length = self.config.n_ctx\n        output_dict = self.model.generate(\n            input_token_ids=inputs,\n            input_lengths=input_lengths,\n            gen_length=max_length - len(inputs[0]),\n            eos_token_id=eos_token_id,\n            # local_batch_size=None,\n            beam_width=num_beams,\n            top_k=top_k,\n            top_p=top_p,\n            # top_p_decay: Optional[torch.FloatTensor] = None,\n            # top_p_min: Optional[torch.FloatTensor] = None,\n            # top_p_reset_ids: 
Optional[torch.IntTensor] = None,\n            temperature=temperature,\n            repetition_penalty=repetition_penalty,\n            # presence_penalty: Optional[torch.FloatTensor] = None,\n            min_length=min_length,\n            len_penalty=len_penalty,\n            # beam_search_diversity_rate: Optional[torch.FloatTensor] = None,\n            # stop_words_list: Optional[torch.IntTensor] = None,\n            # bad_words_list: Optional[torch.IntTensor] = None,\n            # sequence_limit_lengths: Optional[torch.IntTensor] = None,\n            # random_seed: Optional[torch.LongTensor] = None,\n            # memory_length: Optional[int] = None,\n            return_output_length=True,\n            return_log_probs=False,\n        )\n        output_token_ids = output_dict[\"output_token_ids\"]\n        output_lengths = output_dict[\"output_lengths\"]\n        # tokens = output_token_ids[0, 0, input_lengths[0]:output_lengths[0]]\n        tokens = [\n            # output_token_ids[i, 0, input_lengths[i]:output_lengths[i]]\n            output_token_ids[i, 0, : output_lengths[i]]\n            for i in range(batch_size)\n        ]\n        return tokens\n\n\ndef convert_gpt2_lm_head_model(\n    model: GPT2LMHeadModel,\n    tokenizer,\n    weight_data_type=\"fp32\",\n    data_type=\"fp16\",\n    use_fp32_to_compute_logit=False,\n):\n    \"\"\"\n    currently doesn't support fp8 or multi-gpu\n    \"\"\"\n    weights_data_type = weight_data_type\n    temp_dir = tempfile.TemporaryDirectory()\n    temp_dir_path = temp_dir.name\n    ft_model_location = saved_dir = temp_dir_path + \"/gpt2\"\n    hf_config = model.config.to_dict()\n    # convert huggingface model to faster transformer model\n    convert_huggingface_gpt_to_faster_transformer(\n        saved_dir=saved_dir,\n        model=model.transformer,\n        weight_data_type=weight_data_type,\n    )\n\n    head_num = hf_config[\"n_head\"]\n    layer_num = hf_config[\"n_layer\"]\n    start_id = 
hf_config[\"bos_token_id\"]\n    end_id = hf_config[\"eos_token_id\"]\n    size_per_head = hf_config[\"n_embd\"] // head_num\n\n    vocab_size = tokenizer.vocab_size\n\n    tensor_para_size = 1\n    pipeline_para_size = 1\n    ckpt_path = os.path.join(ft_model_location, f\"{tensor_para_size}-gpu\")\n    max_seq_len = hf_config[\"n_ctx\"]\n    int8_mode = 0  # 0: no quantization, 1: quantize weights to int8\n    # load faster transformer model, note that the lm_head is not saved\n    # it's reconstructed during loading from the embedding weights\n    gpt = gpt_decoder.Gpt(\n        num_heads=head_num,\n        size_per_head=size_per_head,\n        num_layers=layer_num,\n        vocab_size=vocab_size,\n        start_id=start_id,\n        end_id=end_id,\n        tensor_para_size=tensor_para_size,\n        pipeline_para_size=pipeline_para_size,\n        lib_path=lib_path,\n        max_seq_len=max_seq_len,\n        int8_mode=int8_mode,\n        inference_data_type=data_type,\n        weights_data_type=weights_data_type,\n        use_fp32_to_compute_logit=use_fp32_to_compute_logit,\n    )\n    gpt.load(ckpt_path, data_type)\n    return FasterTransformerGPT2Wrapper(gpt, model.config)\n\n\n# from transformers import GPT2LMHeadModel, GPT2Tokenizer\n# tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n# tokenizer.pad_token = tokenizer.eos_token\n# model = hf_model = GPT2LMHeadModel.from_pretrained(\"gpt2\").to(\"cuda\").eval()\n# hf_config = hf_model.config.to_dict()\n\n\n# model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n# weight_data_type = weights_data_type = \"fp32\" # fp32 or fp16\n# data_type = \"fp32\" # fp32 or fp16\n# faster_model= convert_gpt2_lm_head_model(\n# model, tokenizer,\n# weight_data_type=weight_data_type,\n# data_type=data_type)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/gpt_decoder.py",
    "content": "# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/gpt_decoder.py # noqa: E501\n# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nfrom abc import abstractmethod\nfrom pathlib import Path\nfrom typing import List, Literal, Optional, Union\nimport os\n\nimport numpy as np\n\nfrom . import comm\nfrom . import profiler\nfrom .gpt import GptInitModelParameters\n\nfrom nebullvm.optional_modules.torch import torch\n\nPathLike = Union[str, Path]\n\n\ndef to_numpy_dtype(maybe_str_dtype: Union[str, np.dtype]):\n    assert isinstance(maybe_str_dtype, (str, np.dtype))\n    if isinstance(maybe_str_dtype, str):\n        try:\n            dtype = {\n                \"fp16\": np.float16,\n                \"float16\": np.float16,\n                \"fp32\": np.float32,\n                \"float32\": np.float32,\n            }[maybe_str_dtype]\n        except KeyError:\n            raise ValueError(\n                f\"Cannot convert to numpy data type, got {maybe_str_dtype}\"\n            )\n    else:\n        dtype = maybe_str_dtype\n    return dtype\n\n\ndef to_torch_dtype(maybe_str_dtype: Union[str, torch.dtype]):\n\n    if isinstance(maybe_str_dtype, torch.dtype):\n        dtype = maybe_str_dtype\n    else:\n        try:\n            dtype = {\n                \"bf16\": torch.bfloat16,\n                
\"fp16\": torch.float16,\n                \"fp32\": torch.float32,\n                \"bfloat16\": torch.bfloat16,\n                \"float16\": torch.float16,\n                \"float32\": torch.float32,\n            }[maybe_str_dtype]\n        except KeyError:\n            raise ValueError(\n                f\"Cannot convert to torch data type, got {maybe_str_dtype}\"\n            )\n    return dtype\n\n\ndef load_weight_from_bin(\n    checkpoint_path: PathLike,\n    shape: List[int],\n    weight_dtype: Union[str, np.dtype],\n):\n    \"\"\"Load a weight from a bin file.\n\n    # Args.\n        checkpoint_path: str or Path,\n            a checkpoint file path of an FT's layer weight.\n        shape: list of int, the shape of weight tensor.\n        weight_dtype: str or np.dtype, the data type of the stored weight.\n    \"\"\"\n    weight_dtype = to_numpy_dtype(weight_dtype)\n    return torch.from_numpy(np.fromfile(checkpoint_path, dtype=weight_dtype))\n\n\nLayernormType = Literal[\"pre_layernorm\", \"post_layernorm\"]\n\n\nclass GptLayerWeights:\n    def __init__(\n        self,\n        num_heads: int,\n        size_per_head: int,\n        inter_size: int,\n        num_layers: int,\n        tensor_para_size: int = 1,\n        pipeline_para_size: int = 1,\n        has_adapters: bool = False,\n        adapter_inter_size: int = 0,\n        int8_mode: int = 0,\n    ):\n\n        assert num_heads % tensor_para_size == 0, (\n            f\"num_heads ({num_heads}) is not multiple of \"\n            \"tensor para size ({tensor_para_size})\"\n        )\n\n        self.num_heads = num_heads\n        self.size_per_head = size_per_head\n        self.hidden_units = num_heads * size_per_head\n        self.num_layers = num_layers\n\n        self.tensor_para_size = tensor_para_size\n        self.tensor_para_rank = comm.get_tensor_para_rank()\n        self.pipeline_para_size = pipeline_para_size\n        self.pipeline_para_rank = comm.get_pipeline_para_rank()\n\n        
self.has_adapters = has_adapters\n        self.adapter_inter_size = adapter_inter_size\n\n        self.local_num_layers = num_layers // pipeline_para_size\n        self.local_num_heads = num_heads // tensor_para_size\n        self.local_hidden_units = self.local_num_heads * size_per_head\n        self.local_inter_size = inter_size // tensor_para_size\n        self.local_adapter_inter_size = (\n            self.adapter_inter_size // tensor_para_size\n        )\n\n        self.weight_transpose_calibrate_quantize = None\n        assert int8_mode in [0, 1], \"Invalid int8 mode for GPT. Must be 0 or 1\"\n        self.int8_mode = int8_mode\n        if self.int8_mode == 1:\n            quant = (\n                torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix  # noqa: E501\n            )\n            self.weight_transpose_calibrate_quantize = lambda x: quant(\n                x, torch.int8\n            )\n\n        self.weights = None\n        self.int8_weights = None\n        self.int8_scales = None\n\n        self.expected_weight_shapes = list()\n\n        # pylint:disable=line-too-long\n        # Transformer blocks\n        self.expected_weight_shapes.extend(\n            [(self.hidden_units,)] * self.local_num_layers\n        )  # input layernorm weight\n        self.expected_weight_shapes.extend(\n            [(self.hidden_units,)] * self.local_num_layers\n        )  # input layernorm bias\n        self.expected_weight_shapes.extend(\n            [(self.hidden_units, self.local_hidden_units * 3)]\n            * self.local_num_layers\n        )  # attention qkv weight\n        self.expected_weight_shapes.extend(\n            [(self.local_hidden_units * 3,)] * self.local_num_layers\n        )  # attention qkv bias\n        self.expected_weight_shapes.extend(\n            [(self.local_hidden_units, self.hidden_units)]\n            * self.local_num_layers\n        )  # attention dense weight\n        self.expected_weight_shapes.extend(\n        
    [(self.hidden_units,)] * self.local_num_layers\n        )  # attention dense bias\n        self.expected_weight_shapes.extend(\n            [(self.hidden_units,)] * self.local_num_layers\n        )  # post attention layernorm weight\n        self.expected_weight_shapes.extend(\n            [(self.hidden_units,)] * self.local_num_layers\n        )  # post attention layernorm bias\n        self.expected_weight_shapes.extend(\n            [(self.hidden_units, self.local_inter_size)]\n            * self.local_num_layers\n        )  # ffn_kernel1\n        self.expected_weight_shapes.extend(\n            [(self.local_inter_size,)] * self.local_num_layers\n        )  # ffn_bias1\n        self.expected_weight_shapes.extend(\n            [(self.local_inter_size, self.hidden_units)]\n            * self.local_num_layers\n        )  # ffn_kernel2\n        self.expected_weight_shapes.extend(\n            [(self.hidden_units,)] * self.local_num_layers\n        )  # ffn_bias2\n\n        # Adapters\n        if self.has_adapters:\n            self.expected_weight_shapes.extend(\n                [(self.hidden_units, self.local_adapter_inter_size)]\n                * self.local_num_layers\n            )  # adaptor1_kernel1\n            self.expected_weight_shapes.extend(\n                [(self.local_adapter_inter_size,)] * self.local_num_layers\n            )  # adaptor1_bias1\n            self.expected_weight_shapes.extend(\n                [(self.local_adapter_inter_size, self.hidden_units)]\n                * self.local_num_layers\n            )  # adaptor1_kernel2\n            self.expected_weight_shapes.extend(\n                [(self.hidden_units,)] * self.local_num_layers\n            )  # adaptor1_bias2\n            self.expected_weight_shapes.extend(\n                [(self.hidden_units, self.local_adapter_inter_size)]\n                * self.local_num_layers\n            )  # adaptor2_kernel1\n            self.expected_weight_shapes.extend(\n                
[(self.local_adapter_inter_size,)] * self.local_num_layers\n            )  # adaptor2_bias1\n            self.expected_weight_shapes.extend(\n                [(self.local_adapter_inter_size, self.hidden_units)]\n                * self.local_num_layers\n            )  # adaptor2_kernel2\n            self.expected_weight_shapes.extend(\n                [(self.hidden_units,)] * self.local_num_layers\n            )  # adaptor2_bias2\n        # pylint:enable=line-too-long\n\n    @classmethod\n    def from_config(cls, config: GptInitModelParameters):\n        return cls(\n            num_heads=config.head_num,\n            size_per_head=config.size_per_head,\n            inter_size=4 * config.head_num * config.size_per_head,\n            num_layers=config.layer_num,\n            tensor_para_size=config.tensor_para_size,\n            pipeline_para_size=config.pipeline_para_size,\n            has_adapters=config.has_adapters,\n            adapter_inter_size=config.adapter_inter_size,\n            int8_mode=config.int8_mode,\n        )\n\n    @property\n    def dtype(self):\n        return self.weights[0].dtype\n\n    @property\n    def device(self):\n        return self.weights[0].device\n\n    def _map(self, func):\n        for i in range(len(self.weights)):\n            if isinstance(self.weights[i], list):\n                for j in range(len(self.weights[i])):\n                    self.weights[i][j] = func(self.weights[i][j])\n            else:\n                self.weights[i] = func(self.weights[i])\n\n    def _map_int8(self, func):\n        for i in range(len(self.int8_weights)):\n            if isinstance(self.int8_weights[i], list):\n                for j in range(len(self.int8_weights[i])):\n                    self.int8_weights[i][j] = func(self.int8_weights[i][j])\n\n            else:\n                self.int8_weights[i] = func(self.int8_weights[i])\n        for i in range(len(self.int8_scales)):\n            if isinstance(self.int8_scales[i], list):\n           
     for j in range(len(self.int8_scales[i])):\n                    self.int8_scales[i][j] = func(self.int8_scales[i][j])\n            else:\n                self.int8_scales[i] = func(self.int8_scales[i])\n\n    def float(self):\n        if self.dtype == torch.float32:\n            return\n        self._map(lambda x: x.float())\n\n    def half(self):\n        if self.dtype == torch.float16:\n            return\n        self._map(lambda x: x.half())\n        if self.int8_mode == 1:\n            self._map_int8(lambda w: w.half())\n\n    def bfloat16(self):\n        if self.dtype == torch.bfloat16:\n            return\n        self._map(lambda x: x.bfloat16())\n        if self.int8_mode == 1:\n            self._map_int8(lambda w: w.bfloat16())\n\n    def cuda(self, device=None):\n        self._map(lambda x: x.cuda(device))\n        if self.int8_mode == 1:\n            self._map_int8(lambda x: x.cuda(device))\n\n    def to(self, device=None):\n        self._map(lambda x: x.to(device))\n        if self.int8_mode == 1:\n            self._map_int8(lambda x: x.to(device))\n\n    def is_valid_pp_group(self, layer, pp_rank):\n        return layer // self.layers_per_device == pp_rank\n\n    def load(\n        self,\n        checkpoint_path: PathLike,\n        compute_dtype: torch.dtype,\n        weight_dtype: Optional[Union[str, np.dtype]] = None,\n        device: Optional[Union[int, str, torch.device]] = None,\n    ):\n        \"\"\"Load checkpoint weights.\n\n        # Args.\n            checkpoint_path: str or Path,\n                a checkpoint directory where FT checkpoint files locate.\n            weight_dtype: str or np.dtype, the data type of stored weights.\n        \"\"\"\n\n        checkpoint_path = Path(checkpoint_path)\n        if not checkpoint_path.exists():\n            raise FileNotFoundError(\n                f\"Could not find checkpoint {str(checkpoint_path)}\"\n            )\n\n        weight_dtype = to_numpy_dtype(weight_dtype)\n        print(\n         
   f\"Load weights from {str(checkpoint_path)} (data type: {weight_dtype}\"  # noqa: E501\n        )\n\n        self.weights = list()\n        self.int8_weights = list()\n        self.int8_scales = list()\n        torch.cuda.empty_cache()\n\n        def _load_from_file(fname):\n            quant_sub_names = [\n                \"attention.query_key_value.weight\",\n                \"attention.dense.weight\",\n                \"dense_h_to_4h.weight\",\n                \"dense_4h_to_h.weight\",\n            ]\n            _weight = torch.from_numpy(\n                np.fromfile(checkpoint_path / fname, dtype=weight_dtype)\n            )\n            _weight = _weight.to(compute_dtype)\n            weight_index = len(self.weights)\n            expected_shape = self.expected_weight_shapes[weight_index]\n\n            try:\n                if _weight.nelement() > 0:\n                    _weight = _weight.reshape(expected_shape)\n            except:  # noqa: E722\n                raise ValueError(\n                    f\"num_heads, size_per_head, vocab_size, and max_seq_len must be the same \"  # noqa: E501\n                    f\"as the ones during training (weight: {fname} expected shape: {expected_shape}, \"  # noqa: E501\n                    f\"got shape: {_weight.shape}).\"\n                )\n\n            should_quantize = any(\n                sub_name in fname for sub_name in quant_sub_names\n            )\n            if self.int8_mode != 0 and should_quantize:\n                calibrate = self.weight_transpose_calibrate_quantize\n                int8_weight, int8_scales = calibrate(_weight)\n\n                # int8 weights should appear in same order as FP weights.\n                # Move to device and add to the int8 list.\n                dummy_weight = torch.empty(0, dtype=compute_dtype)\n                if device is not None:\n                    int8_weight = int8_weight.to(device)\n                    int8_scales = int8_scales.to(device)\n                
    dummy_weight = dummy_weight.to(device)\n\n                self.int8_weights.append(int8_weight)\n                self.int8_scales.append(int8_scales)\n                self.weights.append(dummy_weight)\n            else:\n                if device is not None:\n                    _weight = _weight.to(device)\n                self.weights.append(_weight)\n\n        # Load\n        # pylint:disable=line-too-long\n        layer_offset = self.local_num_layers * self.pipeline_para_rank\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.input_layernorm.weight.bin\"\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.input_layernorm.bias.bin\"\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.attention.query_key_value.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.attention.query_key_value.bias.{self.tensor_para_rank}.bin\"  # noqa: E501\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.attention.dense.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.attention.dense.bias.bin\"\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.post_attention_layernorm.weight.bin\"  # noqa: E501\n            )\n            for i in 
range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.post_attention_layernorm.bias.bin\"  # noqa: E501\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.mlp.dense_h_to_4h.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.mlp.dense_h_to_4h.bias.{self.tensor_para_rank}.bin\"  # noqa: E501\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.mlp.dense_4h_to_h.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n            )\n            for i in range(self.local_num_layers)\n        ]\n        [\n            _load_from_file(\n                f\"model.layers.{layer_offset + i}.mlp.dense_4h_to_h.bias.bin\"\n            )\n            for i in range(self.local_num_layers)\n        ]\n\n        if self.has_adapters:\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_attention_adapter.dense_h_to_4h.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n                )\n                for i in range(self.local_num_layers)\n            ]\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_attention_adapter.dense_h_to_4h.bias.{self.tensor_para_rank}.bin\"  # noqa: E501\n                )\n                for i in range(self.local_num_layers)\n            ]\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_attention_adapter.dense_4h_to_h.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n                )\n               
 for i in range(self.local_num_layers)\n            ]\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_attention_adapter.dense_4h_to_h.bias.bin\"  # noqa: E501\n                )\n                for i in range(self.local_num_layers)\n            ]\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_ffn_adapter.dense_h_to_4h.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n                )\n                for i in range(self.local_num_layers)\n            ]\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_ffn_adapter.dense_h_to_4h.bias.{self.tensor_para_rank}.bin\"  # noqa: E501\n                )\n                for i in range(self.local_num_layers)\n            ]\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_ffn_adapter.dense_4h_to_h.weight.{self.tensor_para_rank}.bin\"  # noqa: E501\n                )\n                for i in range(self.local_num_layers)\n            ]\n            [\n                _load_from_file(\n                    f\"model.layers.{layer_offset + i}.after_ffn_adapter.dense_4h_to_h.bias.bin\"  # noqa: E501\n                )\n                for i in range(self.local_num_layers)\n            ]\n\n        assert len(self.weights) == len(\n            self.expected_weight_shapes\n        ), \"Incorrect number of weights loaded\"\n\n\nclass FtModuleBase:\n    def __init__(self):\n        self.weight = None\n\n    @classmethod\n    @abstractmethod\n    def from_config(cls, config: GptInitModelParameters, **kwargs):\n        raise NotImplementedError\n\n    @abstractmethod\n    def _initialize_model(self, force_init=False):\n        raise NotImplementedError\n\n    @abstractmethod\n    def forward(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def set_weight(self, weight: 
GptLayerWeights):\n        old_weight_dtype = (\n            self.weight.dtype if self.weight is not None else None\n        )\n        self.weight = weight\n        if old_weight_dtype is None or old_weight_dtype != self.weight.dtype:\n            self._initialize_model(force_init=True)\n\n    @property\n    def dtype(self):\n        assert self.weight is not None\n        return self.weight.dtype\n\n    @property\n    def device(self):\n        assert self.weight is not None\n        return self.weight.device\n\n    def cuda(self, device=None):\n        assert torch.cuda.is_available()\n        self.weight.cuda(device)\n        return self\n\n    def to(self, device=None):\n        self.weight.to(device)\n        return self\n\n    def float(self):\n        self.weight.float()\n        self._initialize_model(force_init=True)\n        return self\n\n    def half(self):\n        self.weight.half()\n        self._initialize_model(force_init=True)\n        return self\n\n    def bfloat16(self):\n        self.weight.bfloat16()\n        self._initialize_model(force_init=True)\n        return self\n\n\nclass GptContextDecoder(FtModuleBase):\n    def __init__(\n        self,\n        num_heads: int,\n        size_per_head: int,\n        inter_size: int,\n        num_layers: int,\n        tensor_para_size: int = 1,\n        pipeline_para_size: int = 1,\n        remove_padding: bool = True,\n        shared_contexts_ratio: float = 1.0,\n        layernorm_eps: float = 1e-6,\n        layernorm_type: LayernormType = \"pre_layernorm\",\n        activation_type: str = \"gelu\",\n        has_adapters: bool = False,\n        adapter_inter_size: int = 0,\n        int8_mode: int = 0,\n    ):\n        super().__init__()\n        self.num_heads = num_heads\n        self.size_per_head = size_per_head\n        self.hidden_size = self.num_heads * self.size_per_head\n        self.inter_size = inter_size\n        self.num_layers = num_layers\n\n        self.tensor_para_size = 
tensor_para_size\n        self.pipeline_para_size = pipeline_para_size\n\n        self.remove_padding = remove_padding\n        self.shared_contexts_ratio = shared_contexts_ratio\n\n        self.layernorm_eps = layernorm_eps\n        self.layernorm_type = layernorm_type\n        self.activation_type = activation_type\n        self.has_adapters = has_adapters\n        self.adapter_inter_size = adapter_inter_size\n\n        assert int8_mode in [0, 1]\n        self.int8_mode = int8_mode\n\n        self.ft_op = None\n        self.weight = None\n\n    def __repr__(self):\n        args_dict = dict(\n            num_heads=self.num_heads,\n            size_per_head=self.size_per_head,\n            hidden_size=self.hidden_size,\n            inter_size=self.inter_size,\n            num_layers=self.num_layers,\n            tensor_para_size=self.tensor_para_size,\n            pipeline_para_size=self.pipeline_para_size,\n            remove_padding=self.remove_padding,\n            shared_contexts_ratio=self.shared_contexts_ratio,\n            layernorm_eps=self.layernorm_eps,\n            layernorm_type=self.layernorm_type,\n            activation_type=self.activation_type,\n            has_adapters=self.has_adapters,\n            adapter_inter_size=self.adapter_inter_size,\n            int8_mode=self.int8_mode,\n        )\n        args_str = \",\\n    \".join([f\"{k}: {v}\" for k, v in args_dict.items()])\n        return f\"{self.__class__.__name__}[\\n{    args_str}\\n]\"\n\n    @classmethod\n    def from_config(cls, config: GptInitModelParameters, **kwargs):\n        return cls(\n            num_heads=config.head_num,\n            size_per_head=config.size_per_head,\n            inter_size=4 * config.head_num * config.size_per_head,\n            num_layers=config.layer_num,\n            tensor_para_size=config.tensor_para_size,\n            pipeline_para_size=config.pipeline_para_size,\n            remove_padding=kwargs.get(\"remove_padding\", True),\n            
shared_contexts_ratio=kwargs.get(\"shared_contexts_ratio\", 1.0),\n            layernorm_eps=config.layernorm_eps,\n            layernorm_type=config.layernorm_type,\n            activation_type=config.activation_type,\n            has_adapters=config.has_adapters,\n            adapter_inter_size=config.adapter_inter_size,\n            int8_mode=config.int8_mode,\n        )\n\n    def _initialize_model(self, force_init=False):\n        if self.weight is None:\n            self.weight = GptLayerWeights(\n                num_heads=self.num_heads,\n                size_per_head=self.size_per_head,\n                inter_size=self.inter_size,\n                num_layers=self.num_layers,\n                tensor_para_size=self.tensor_para_size,\n                pipeline_para_size=self.pipeline_para_size,\n                has_adapters=self.has_adapters,\n                adapter_inter_size=self.adapter_inter_size,\n                int8_mode=self.int8_mode,\n            )\n        if not force_init and self.ft_op is not None:\n            return\n        if self.ft_op is not None:\n            del self.ft_op\n\n        self.ft_op = (\n            torch.classes.FasterTransformer.ParallelGptContextDecoderOp(\n                self.num_heads,\n                self.size_per_head,\n                self.inter_size,\n                self.num_layers,\n                self.tensor_para_size,\n                self.pipeline_para_size,\n                self.layernorm_eps,\n                self.layernorm_type,\n                self.activation_type,\n                self.has_adapters,\n                self.adapter_inter_size,\n                self.int8_mode,\n                self.weight.weights,\n                self.weight.int8_weights,\n                self.weight.int8_scales,\n                self.remove_padding,\n            )\n        )\n\n    def forward(\n        self,\n        input_embeds: torch.Tensor,\n        attention_mask: torch.Tensor,\n        input_lengths: 
torch.IntTensor,\n        memory_length: Optional[int] = None,\n        compact_index: Optional[torch.IntTensor] = None,\n        batch_to_compact_index: Optional[torch.IntTensor] = None,\n        linear_bias_slopes: Optional[torch.Tensor] = None,\n    ):\n        \"\"\"\n\n        # Args.\n            input_embeds: Tensor, (batch * beam, max_input_length, hidden_dim),\n                input hidden states.\n            attention_mask: Tensor, (batch * beam, max_input_length, max_input_length),\n                input attention mask.\n            input_lengths: (batch * beam,), input sequence lengths.\n            memory_length: int, the length of memory to keep key/cache values.\n            compact_index: IntTensor, (compact_batch_size,)\n                The index of input sequences of a compact batch. If None, the FT op\n                doesn't apply the shared context feature and as result the inference\n                time may increase.\n            batch_to_compact_index: IntTensor, (batch * beam,)\n                The index map from the original input batch to the compact batch.\n                This must be provided if compact_index is not None.\n            linear_bias_slopes: (num_heads,)\n                The slope per head of linear attention bias - ALiBi. 
If None, a base\n                self attention will be performed.\n        # Returns\n            hidden_states: Tensor, (batch * beam, max_input_length, hidden_dim),\n                decoder outputs.\n            key_cache: Tensor, (num_layers, batch * beam, local_num_heads, size_per_head / x, memory_length, x), # noqa: E501\n                key cache of attention of inputs.\n                x = 16 / sizeof(T), memory_length = max_input_length or max_input_length + gen_length # noqa: E501\n            value_cache: Tensor, (num_layers, batch * beam, local_num_heads, memory_length, hidden_dim) # noqa: E501\n                value cache of attention\n            last_token_hidden_states: Tensor, (batch * beam, hidden_dim)\n                hidden states of the last input token.\n        \"\"\"\n        self._initialize_model()\n        # outputs: output hidden states\n        (\n            decoder_ouptut,\n            key_cache,\n            value_cache,\n            last_token_hidden_states,\n        ) = self.ft_op.forward(\n            input_embeds,\n            attention_mask,\n            input_lengths,\n            memory_length,\n            compact_index,\n            batch_to_compact_index,\n            linear_bias_slopes,\n        )\n        return decoder_ouptut, key_cache, value_cache, last_token_hidden_states\n\n\nclass GptDecoder(FtModuleBase):\n    def __init__(\n        self,\n        num_heads: int,\n        size_per_head: int,\n        inter_size: int,\n        num_layers: int,\n        tensor_para_size: int = 1,\n        pipeline_para_size: int = 1,\n        layernorm_eps: float = 1e-6,\n        layernorm_type: LayernormType = \"pre_layernorm\",\n        activation_type: str = \"gelu\",\n        has_adapters: bool = False,\n        adapter_inter_size: int = 0,\n        int8_mode: int = 0,\n    ):\n        super().__init__()\n        self.num_heads = num_heads\n        self.size_per_head = size_per_head\n        self.hidden_size = self.num_heads * 
self.size_per_head\n        self.inter_size = inter_size\n        self.num_layers = num_layers\n\n        self.tensor_para_size = tensor_para_size\n        self.pipeline_para_size = pipeline_para_size\n\n        self.layernorm_eps = layernorm_eps\n        self.layernorm_type = layernorm_type\n        self.activation_type = activation_type\n        self.has_adapters = has_adapters\n        self.adapter_inter_size = adapter_inter_size\n\n        self.int8_mode = int8_mode\n\n        self.ft_op = None\n        self.weight = None\n\n    def __repr__(self):\n        args_dict = dict(\n            num_heads=self.num_heads,\n            size_per_head=self.size_per_head,\n            hidden_size=self.hidden_size,\n            inter_size=self.inter_size,\n            num_layers=self.num_layers,\n            tensor_para_size=self.tensor_para_size,\n            pipeline_para_size=self.pipeline_para_size,\n            layernorm_eps=self.layernorm_eps,\n            layernorm_type=self.layernorm_type,\n            activation_type=self.activation_type,\n            has_adapters=self.has_adapters,\n            adapter_inter_size=self.adapter_inter_size,\n            int8_mode=self.int8_mode,\n        )\n        args_str = \",\\n    \".join(\n            [f\"{k}: {v}\" for k, v in args_dict.items()]\n        )  # noqa: E501\n        return f\"{self.__class__.__name__}[\\n    {args_str}\\n]\"\n\n    @classmethod\n    def from_config(cls, config: GptInitModelParameters, **kwargs):\n        hidden_dim = config.head_num * config.size_per_head\n        return cls(\n            num_heads=config.head_num,\n            size_per_head=config.size_per_head,\n            inter_size=4 * hidden_dim,\n            num_layers=config.layer_num,\n            tensor_para_size=config.tensor_para_size,\n            pipeline_para_size=config.pipeline_para_size,\n            layernorm_eps=config.layernorm_eps,\n            layernorm_type=config.layernorm_type,\n            
activation_type=config.activation_type,\n            has_adapters=config.has_adapters,\n            adapter_inter_size=config.adapter_inter_size,\n            int8_mode=config.int8_mode,\n        )\n\n    def _initialize_model(self, force_init=False):\n        if self.weight is None:\n            self.weight = GptLayerWeights(\n                num_heads=self.num_heads,\n                size_per_head=self.size_per_head,\n                inter_size=self.inter_size,\n                num_layers=self.num_layers,\n                tensor_para_size=self.tensor_para_size,\n                pipeline_para_size=self.pipeline_para_size,\n                has_adapters=self.has_adapters,\n                adapter_inter_size=self.adapter_inter_size,\n                int8_mode=self.int8_mode,\n            )\n        if not force_init and self.ft_op is not None:\n            return\n        if self.ft_op is not None:\n            del self.ft_op\n        self.ft_op = torch.classes.FasterTransformer.ParallelGptDecoderOp(\n            self.num_heads,\n            self.size_per_head,\n            self.inter_size,\n            self.num_layers,\n            self.tensor_para_size,\n            self.pipeline_para_size,\n            self.layernorm_eps,\n            self.layernorm_type,\n            self.activation_type,\n            self.has_adapters,\n            self.adapter_inter_size,\n            self.weight.int8_mode,\n            self.weight.weights,\n            self.weight.int8_weights,\n            self.weight.int8_scales,\n        )\n\n    def forward(\n        self,\n        max_input_length: int,\n        step: int,\n        ite: int,\n        input_embeds: torch.Tensor,\n        sequence_lengths: torch.IntTensor,\n        key_cache: torch.Tensor,\n        value_cache: torch.Tensor,\n        finished: torch.BoolTensor,\n        total_padding_tokens: torch.IntTensor,\n        masked_tokens: torch.BoolTensor,\n        cache_indirection: Optional[torch.IntTensor] = None,\n        
linear_bias_slopes: Optional[torch.Tensor] = None,\n    ):\n        \"\"\"\n\n        # Args.\n            max_input_length: int, maximum input context length.\n            step: int, the current step index.\n            ite: int, local batch iteration.\n            input_embeds: Tensor, (local_batch * beam, hidden_dim),\n                input hidden state to decoder.\n            sequence_lengths: IntTensor, (local_batch * beam,),\n                the current sequence lengths.\n            key_cache: Tensor, key cache buffer.\n            value_cache: Tensor, value cache buffer.\n            finished: BoolTensor, (local_batch * beam,),\n                whether to finish sentence generation.\n            total_padding_tokens IntTensor, (local_batch * beam,),\n                the number of padded tokens.\n            masked_tokens: BoolTensor, (local_batch * beam, memory_length),\n                a mask tensor that indicates padded tokens.\n            cache_indirection: IntTensor, (local_batch * beam,),\n                cache of beam positions if needed if beam > 1.\n            linear_bias_slopes Tensor, (num_heads,)\n                slopes head of linear position bias (ALiBi) (optional).\n        # Returns\n            IntTensor, (batch * beam,) output token ids.\n        \"\"\"\n\n        self._initialize_model()\n\n        outputs = self.ft_op.forward(\n            max_input_length,\n            step,\n            ite,\n            input_embeds,\n            sequence_lengths,\n            finished,\n            total_padding_tokens,\n            masked_tokens,\n            key_cache,\n            value_cache,\n            cache_indirection,\n            linear_bias_slopes,\n        )\n        return outputs[0]\n\n\nclass Gpt:\n    def __init__(\n        self,\n        num_heads: int,\n        size_per_head: int,\n        num_layers: int,\n        vocab_size: int,\n        start_id: int,\n        end_id: int,\n        lib_path: PathLike,\n        
tensor_para_size: int = 1,\n        pipeline_para_size: int = 1,\n        remove_padding: bool = True,\n        shared_contexts_ratio: float = 1.0,\n        layernorm_eps: float = 1e-6,\n        layernorm_type: LayernormType = \"pre_layernorm\",\n        activation_type: str = \"gelu\",\n        has_positional_encoding: bool = True,\n        max_seq_len: int = 0,\n        has_pre_decoder_layernorm: bool = False,\n        has_post_decoder_layernorm: bool = True,\n        has_adapters: bool = False,\n        adapter_inter_size: int = 0,\n        int8_mode: int = 0,\n        inference_data_type: Optional[str] = None,\n        weights_data_type: str = \"fp32\",\n        use_fp32_to_compute_logit: bool = False,\n        **kwargs,\n    ):\n        super().__init__()\n\n        inference_data_type = inference_data_type or weights_data_type\n\n        self.config = GptInitModelParameters(\n            head_num=num_heads,\n            size_per_head=size_per_head,\n            layer_num=num_layers,\n            max_seq_len=max_seq_len,\n            tensor_para_size=tensor_para_size,\n            vocab_size=vocab_size,\n            start_id=start_id,\n            end_id=end_id,\n            pipeline_para_size=pipeline_para_size,\n            data_type=inference_data_type,\n            weights_data_type=weights_data_type,\n            layernorm_eps=layernorm_eps,\n            layernorm_type=layernorm_type,\n            activation_type=activation_type,\n            has_positional_encoding=has_positional_encoding,\n            has_pre_decoder_layernorm=has_pre_decoder_layernorm,\n            has_post_decoder_layernorm=has_post_decoder_layernorm,\n            has_adapters=has_adapters,\n            adapter_inter_size=adapter_inter_size,\n            int8_mode=int8_mode,\n            sparse=kwargs.get(\"sparse\", False),\n        )\n        self.use_fp32_to_compute_logit = use_fp32_to_compute_logit\n\n        self.weight = None\n        self.shared_contexts_ratio = 
shared_contexts_ratio\n\n        torch.classes.load_library(os.path.abspath(lib_path))\n\n        # Embeddings to encode or decode tokens.\n        hidden_dim = num_heads * size_per_head\n\n        # Pad vocab size for FT.\n        local_vocab_size = math.ceil(\n            self.config.vocab_size / self.config.tensor_para_size\n        )\n        if self.config.data_type == \"fp16\":\n            local_vocab_size = math.ceil(local_vocab_size / 8) * 8\n        self.vocab_size_padded = (\n            local_vocab_size * self.config.tensor_para_size\n        )\n        self.vocab_size = self.config.vocab_size\n\n        self.decode_op = torch.classes.FasterTransformer.DynamicDecodeOp(\n            self.vocab_size,\n            self.vocab_size_padded,\n            self.config.tensor_para_size,\n            self.config.pipeline_para_size,\n            torch.float,\n        )\n\n        self._parameters = {}\n\n        def register_param(name, p):\n            self._parameters[name] = p\n            setattr(self, name, p)\n\n        register_param(\n            \"context_decoder\",\n            GptContextDecoder.from_config(\n                self.config,\n                remove_padding=remove_padding,\n                shared_contexts_ratio=shared_contexts_ratio,\n                **kwargs,\n            ),\n        )\n        register_param(\n            \"decoder\", GptDecoder.from_config(self.config, **kwargs)\n        )\n\n        compute_dtype = to_torch_dtype(inference_data_type)\n\n        if comm.is_pipeline_group_first():\n            register_param(\n                \"word_embedding\",\n                torch.nn.Embedding(\n                    self.vocab_size_padded, hidden_dim, dtype=compute_dtype\n                ),\n            )\n            self._mask_padded_vocab_weights(self.word_embedding.weight)\n            if self.config.has_positional_encoding:\n                register_param(\n                    \"position_encoding\",\n                    
torch.nn.Embedding(\n                        self.config.max_seq_len,\n                        hidden_dim,\n                        dtype=compute_dtype,\n                    ),\n                )\n            else:\n                self.position_encoding = None\n            if self.config.has_pre_decoder_layernorm:\n                register_param(\n                    \"pre_decoder_layernorm\",\n                    torch.nn.LayerNorm(\n                        hidden_dim, eps=layernorm_eps, dtype=compute_dtype\n                    ),\n                )\n            else:\n                self.pre_decoder_layernorm = None\n\n        if comm.is_pipeline_group_last():\n            if has_post_decoder_layernorm:\n                register_param(\n                    \"post_decoder_layernorm\",\n                    torch.nn.LayerNorm(\n                        hidden_dim, eps=layernorm_eps, dtype=compute_dtype\n                    ),\n                )\n            else:\n                self.post_decoder_layernorm = None\n            self.lm_head_ctype = (\n                compute_dtype\n                if not self.use_fp32_to_compute_logit\n                else torch.float32\n            )\n            register_param(\n                \"lm_head\",\n                torch.nn.Linear(\n                    hidden_dim,\n                    self.vocab_size_padded,\n                    bias=False,\n                    dtype=self.lm_head_ctype,\n                ),\n            )\n            self._mask_padded_vocab_weights(self.lm_head.weight)\n\n    @classmethod\n    def from_config(cls, config: GptInitModelParameters, **kwargs):\n        return cls(\n            num_heads=config.head_num,\n            size_per_head=config.size_per_head,\n            num_layers=config.layer_num,\n            max_seq_len=config.max_seq_len,\n            tensor_para_size=config.tensor_para_size,\n            vocab_size=config.vocab_size,\n            start_id=config.start_id,\n            
end_id=config.end_id,\n            pipeline_para_size=config.pipeline_para_size,\n            inference_data_type=config.data_type,\n            weights_data_type=config.weights_data_type,\n            layernorm_eps=config.layernorm_eps,\n            layernorm_type=config.layernorm_type,\n            activation_type=config.activation_type,\n            has_positional_encoding=config.has_positional_encoding,\n            has_pre_decoder_layernorm=config.has_pre_decoder_layernorm,\n            has_post_decoder_layernorm=config.has_post_decoder_layernorm,\n            has_adapters=config.has_adapters,\n            adapter_inter_size=config.adapter_inter_size,\n            int8_mode=config.int8_mode,\n            **kwargs,\n        )\n\n    def load(\n        self,\n        checkpoint_path: PathLike,\n        inference_data_type: Optional[Union[str, torch.dtype]] = None,\n        config: Optional[GptInitModelParameters] = None,\n        device: Optional[Union[str, int, torch.device]] = None,\n    ):\n\n        checkpoint_path = Path(checkpoint_path)\n        device = device or comm.get_device()\n        config = config or self.config\n\n        compute_dtype = to_torch_dtype(inference_data_type or self.dtype)\n\n        self.weight = GptLayerWeights.from_config(config)\n        self.weight.load(\n            checkpoint_path, compute_dtype, config.weights_data_type, device\n        )\n\n        self.context_decoder.set_weight(self.weight)\n        self.decoder.set_weight(self.weight)\n\n        weight_dtype = to_numpy_dtype(config.weights_data_type)\n\n        def _safe_load_from_bin(param: torch.nn.Parameter, fname):\n            if (checkpoint_path / fname).exists():\n                # np_w is 1-D array since a bin file doesn't have shape info.\n                w_ = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)\n                param.data = (\n                    torch.from_numpy(w_)\n                    .reshape(param.data.shape)\n                    
.to(compute_dtype)\n                )\n            else:\n                raise FileNotFoundError(f\"Faile to load {fname}\")\n\n        def _safe_load_lm_head_from_bin(param, fname, ctype):\n            if (checkpoint_path / fname).exists():\n                shape = (\n                    self.vocab_size,\n                    self.config.head_num * self.config.size_per_head,\n                )\n                # np_w is 1-D array since a bin file doesn't have shape info.\n                w_ = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)\n                param.data = param.data.to(ctype)\n                param.data[: self.vocab_size, :] = (\n                    torch.from_numpy(w_).reshape(shape).to(ctype)\n                )\n            else:\n                print(f\"Faile to load {fname}\")\n                torch.nn.init.normal_(param).to(compute_dtype)\n            self._mask_padded_vocab_weights(param)\n\n        # pylint:disable=line-too-long\n        if comm.is_pipeline_group_first():\n            _safe_load_lm_head_from_bin(\n                self.word_embedding.weight, \"model.wte.bin\", compute_dtype\n            )\n            self._mask_padded_vocab_weights(self.word_embedding.weight)\n            if self.position_encoding is not None:\n                _safe_load_from_bin(\n                    self.position_encoding.weight, \"model.wpe.bin\"\n                )\n            if self.pre_decoder_layernorm is not None:\n                _safe_load_from_bin(\n                    self.pre_decoder_layernorm.weight,\n                    \"model.pre_decoder_layernorm.weight.bin\",\n                )\n                _safe_load_from_bin(\n                    self.pre_decoder_layernorm.bias,\n                    \"model.pre_decoder_layernorm.bias.bin\",\n                )\n        if comm.is_pipeline_group_last():\n            if self.post_decoder_layernorm is not None:\n                _safe_load_from_bin(\n                    
self.post_decoder_layernorm.weight,\n                    \"model.final_layernorm.weight.bin\",\n                )\n                _safe_load_from_bin(\n                    self.post_decoder_layernorm.bias,\n                    \"model.final_layernorm.bias.bin\",\n                )\n            if (checkpoint_path / \"model.lm_head.weight.bin\").exists():\n                _safe_load_lm_head_from_bin(\n                    self.lm_head.weight,\n                    \"model.lm_head.weight.bin\",\n                    self.lm_head_ctype,\n                )\n            else:\n                if self.use_fp32_to_compute_logit:\n                    _safe_load_lm_head_from_bin(\n                        self.lm_head.weight, \"model.wte.bin\", torch.float32\n                    )\n                else:\n                    # In this branch we can share the pre and post\n                    # decoder embeddings, but ONLY pipeline size is 1.\n                    # When pipeline size > 1, these two weights will end up on\n                    # different GPUs, so we must load the\n                    # post decoder weight again (else case).\n                    if comm.get_pipeline_para_size() == 1:\n                        self.lm_head.weight = self.word_embedding.weight\n                    else:\n                        _safe_load_lm_head_from_bin(\n                            self.lm_head.weight, \"model.wte.bin\", compute_dtype\n                        )\n\n        self.to(device)\n\n    @property\n    def dtype(self):\n        assert self.weight is not None\n        return self.weight.dtype\n\n    @property\n    def device(self):\n        assert self.weight is not None\n        return self.weight.device\n\n    def cuda(self, device=None):\n        assert torch.cuda.is_available()\n        for name, param in self._parameters.items():\n            setattr(self, name, param.cuda(device))\n        return self\n\n    def to(self, device=None):\n        for name, param in 
self._parameters.items():\n            setattr(self, name, param.to(device))\n        return self\n\n    def float(self):\n        for name, param in self._parameters.items():\n            setattr(self, name, param.float())\n        return self\n\n    def half(self):\n        for name, param in self._parameters.items():\n            setattr(self, name, param.half())\n        return self\n\n    def bfloat16(self):\n        for name, param in self._parameters.items():\n            setattr(self, name, param.bfloat16())\n        return self\n\n    def _mask_padded_vocab_weights(self, weight: torch.Tensor):\n        assert self.vocab_size_padded >= self.vocab_size\n        if self.vocab_size_padded > self.vocab_size:\n            weight.data[self.vocab_size :, ...] = 0  # noqa: E203\n\n    def generate_pad_mask(self, input_lengths, memory_length, init_step=0):\n        \"\"\"Generate a pad mask tensor.\n\n        # Args.\n            input_lengths: (batch_size * beam_width,), input lengths\n            memory_length: the length of key/value cache memory.\n            init_step: int, initial step.\n        # Return\n            masked_tokens: BoolTensor,\n                (batch_size * beam_width, memory_length),\n                True if init_step + input_length[i] <= j <\n                    init_step + max_input_length,\n                where i is a batch-beam index and j is a time step\n                modulo by memory_length.\n        \"\"\"\n        max_input_length = input_lengths.max()\n        input_lengths = input_lengths.unsqueeze(1)\n        shift = init_step % memory_length\n        step_indices = torch.arange(\n            init_step, init_step + memory_length, device=input_lengths.device\n        )\n        step_indices = (\n            step_indices.roll(shift)\n            .unsqueeze(0)\n            .tile(input_lengths.shape[0], 1)\n        )\n        masked_tokens = torch.logical_and(\n            step_indices >= input_lengths,\n            step_indices < 
init_step + max_input_length,\n        )\n        return masked_tokens\n\n    def get_local_batch_size(self, batch_size):\n        \"\"\"Get a local batch size by the same way that FT Gpt does.\"\"\"\n        local_batch_size = batch_size\n        pp_size = self.decoder.pipeline_para_size\n        if pp_size > 1:\n            if local_batch_size % pp_size == 0:\n                local_batch_size //= pp_size\n            while local_batch_size > 1024 and local_batch_size % 2 == 0:\n                local_batch_size //= 2\n        return local_batch_size\n\n    @torch.no_grad()\n    def generate(\n        self,\n        input_token_ids: torch.IntTensor,\n        input_lengths: torch.IntTensor,\n        gen_length: int,\n        eos_token_id: Optional[int] = None,\n        local_batch_size: Optional[int] = None,\n        beam_width: int = 1,\n        top_k: Optional[torch.IntTensor] = None,\n        top_p: Optional[torch.FloatTensor] = None,\n        top_p_decay: Optional[torch.FloatTensor] = None,\n        top_p_min: Optional[torch.FloatTensor] = None,\n        top_p_reset_ids: Optional[torch.IntTensor] = None,\n        temperature: Optional[torch.FloatTensor] = None,\n        repetition_penalty: Optional[torch.FloatTensor] = None,\n        presence_penalty: Optional[torch.FloatTensor] = None,\n        min_length: Optional[torch.IntTensor] = None,\n        len_penalty: Optional[torch.FloatTensor] = None,\n        beam_search_diversity_rate: Optional[torch.FloatTensor] = None,\n        stop_words_list: Optional[torch.IntTensor] = None,\n        bad_words_list: Optional[torch.IntTensor] = None,\n        sequence_limit_lengths: Optional[torch.IntTensor] = None,\n        random_seed: Optional[torch.LongTensor] = None,\n        memory_length: Optional[int] = None,\n        return_output_length: bool = False,\n        return_log_probs: bool = False,\n    ):\n        \"\"\"\n\n        # Args.\n            input_token_ids: IntTensor, (batch_size, max_input_length),\n           
     input hidden state to decoder.\n            input_lengths: IntTensor, (batch_size),\n                the lengths of input context sequences.\n            gen_length: int, the number of tokens to generate.\n            local_batch_size: int, optional, a batch size of\n                local iteration. (disabled)\n            eos_token_id: int, eos token id.\n            beam_width: int, number of beams for beam search.\n                If 1, sampling decode will be used.\n            top_k: IntTensor, (batch_size,) top-k sampling.\n                The number of most probable tokens to keep\n                for sampling per sentence in a batcch.\n            top_p: FloatTensor, (batch_size,), top-p sampling.\n                The cumulative probability\n                of to filter the set of most probable tokens.\n            top_p_decay: FloatTensor, (batch_size,)\n                The decay of top-p value for top_p sampling.\n            top_p_min: FloatTensor, (batch_size,)\n                The minimum top p values in top-p decaying.\n            top_p_reset_ids: IntTensor, (batch_size,)\n                reset ids for resetting top_p values for top p sampling\n            temperature: FloatTensor, (batch_size,),\n                The temperature value for smoothing the logit distribution.\n            repetition_penalty: FloatTensor, (batch_size,),\n                The repetition penalty.\n            presence_penalty: FloatTensor, (batch_size,),\n                The presence penalty, which is exclusive with\n                repetition_penalty.\n                Only one of repetition and presence penalties is allowed.\n            min_length: IntTensor, (batch_size,),\n                Minimum length for each sentences. 
EOS is masked if length is\n                below min.\n            len_penalty: FloatTensor, (batch_size,)\n                The exponent of the length penalty of beam scores.\n            beam_search_diversity_rate: FloatTensor, (batch_size,),\n                The diversity rate of beam search.\n            stop_words_list: IntTensor, (batch_size, 2, stop_words_length)\n                When FT generates words in this list, it will stop the\n                generation. An extension of stop id.\n            bad_words_list IntTensor, (batch_size, 2, bad_words_length)\n                The words in the list will never be sampled.\n            sequence_limit_lengths: IntTensor, (batch_size,), The maximum\n                length of a generated sequence.\n            memory_length: int, the length of cache memory. If None, it will\n                be max_input_length + gen_length.\n        # Returns\n            IntTensor, (batch_size, beam_width, max_seq_length) output\n            token ids.\n        \"\"\"\n        assert (\n            self.weight is not None\n        ), \"Please call load() first to initialize weights.\"\n\n        input_token_ids = input_token_ids.type(torch.int32).to(self.device)\n        input_lengths = input_lengths.type(torch.int32).to(self.device)\n\n        batch_size = len(input_token_ids)\n        max_input_length = input_token_ids.shape[-1]\n        max_seq_length = max_input_length + gen_length\n        memory_length = memory_length or max_seq_length\n\n        # TODO: Enable local batch later. We currently disable local batching due to # noqa: E501\n        #   an input mismatch issue of FT's decode_op: FT's decode_op requires logits # noqa: E501\n        #   of shape (batch_size, ...) but we have logits of shape (local_batch_size, ...) 
# noqa: E501\n        #   After fixing FT's side, we will enable local batch.\n        # local_batch_size = local_batch_size or self.get_local_batch_size(batch_size) # noqa: E501\n        # num_local_batches, last_chunk = divmod(batch_size, local_batch_size)\n        # if last_chunk > 0:\n        #     num_local_batches += 1\n        assert local_batch_size is None or local_batch_size == batch_size\n        local_batch_size = batch_size\n        num_local_batches = 1\n\n        device = self.device\n\n        eos_token_id = (\n            eos_token_id if eos_token_id is not None else self.config.end_id\n        )\n        assert (\n            eos_token_id is not None\n        ), \"eos_token-id must be specified in generation.\"\n        eos_token_ids = eos_token_id * torch.ones(\n            batch_size, dtype=torch.int32, device=device\n        )\n        assert repetition_penalty is None or presence_penalty is None, (\n            \"Found ambiguous parameters repetition_penalty and \"\n            \"presence_penalty which are mutually exclusive. 
\"\n            \"Please provide one of repetition_penalty and presence_penalty.\"\n        )\n        # Setup decoder_op prior to calling the forward function.\n        self.decode_op.setup(\n            batch_size,\n            beam_width,\n            top_k,\n            top_p,\n            temperature,\n            repetition_penalty,\n            presence_penalty,\n            min_length,\n            len_penalty,\n            beam_search_diversity_rate,\n            random_seed,\n            top_p_decay,\n            top_p_min,\n            top_p_reset_ids,\n        )\n\n        # Prepare input and output arguments.\n        if beam_width > 1:\n            # Tiling for beam search.\n            input_token_ids = input_token_ids.repeat(1, beam_width).view(\n                batch_size * beam_width, -1\n            )\n            input_lengths = (\n                input_lengths.view(-1, 1).repeat(1, beam_width).view(-1)\n            )\n            if sequence_limit_lengths is not None:\n                sequence_limit_lengths = (\n                    sequence_limit_lengths.view(-1, 1)\n                    .repeat(1, beam_width)\n                    .view(-1)\n                )\n            # src/tgt cache indirections.\n            cache_indirection = torch.zeros(\n                (2, batch_size, beam_width, memory_length),\n                dtype=torch.int32,\n                device=device,\n            )\n            parent_ids = torch.zeros(\n                max_seq_length,\n                batch_size * beam_width,\n                dtype=torch.int32,\n                device=device,\n            )\n        else:\n            cache_indirection = None\n            src_cache_indirection = None\n            tgt_cache_indirection = None\n            parent_ids = None\n\n        pad_lengths = max_input_length - input_lengths\n        # Since tril() doesn't support bf16 dtype,\n        # we create of bool type and then cast it to dtype.\n        attention_mask = (\n    
        torch.ones(\n                (max_input_length, max_input_length),\n                dtype=torch.bool,\n                device=device,\n            )\n            .tril()\n            .unsqueeze(0)\n            .tile(input_token_ids.shape[0], 1, 1)\n            .to(self.dtype)\n        )\n        for b, input_length in enumerate(input_lengths):\n            attention_mask[b, input_length:, ...] = 0\n        masked_tokens = self.generate_pad_mask(input_lengths, memory_length)\n        finished = torch.zeros_like(input_lengths).bool()\n        sequence_lengths = (max_input_length - 1) * torch.ones_like(\n            input_lengths\n        )\n\n        if return_log_probs or beam_width > 1:\n            cum_log_probs = torch.zeros(batch_size * beam_width, device=device)\n            output_log_probs = torch.zeros(\n                (gen_length, batch_size * beam_width), device=device\n            )\n        else:\n            cum_log_probs = None\n            output_log_probs = None\n\n        # Contiguous buffer for each decode_op step,\n        # it will be transposed tensor for the final output.\n        output_token_ids = torch.zeros(\n            (max_seq_length, batch_size * beam_width),\n            dtype=torch.int32,\n            device=device,\n        )\n        output_token_ids[:max_input_length, ...] 
= input_token_ids.T\n\n        if comm.is_pipeline_group_first():\n            # Prepare input tensors of decoder.\n            input_embeds = self.word_embedding(input_token_ids)\n            if self.position_encoding is not None:\n                position_ids = torch.arange(\n                    0, max_input_length, dtype=torch.int, device=device\n                )\n                position_ids = position_ids.unsqueeze(0).view(\n                    -1, max_input_length\n                )\n                input_embeds += self.position_encoding(position_ids)\n            if self.pre_decoder_layernorm is not None:\n                input_embeds = self.pre_decoder_layernorm(input_embeds)\n        else:\n            # Dummy input_embeds\n            input_embeds = torch.empty(\n                size=(\n                    batch_size * beam_width,\n                    max_input_length,\n                    self.context_decoder.hidden_size,\n                ),\n                dtype=self.context_decoder.dtype,\n                device=device,\n            )\n\n        use_shared_contexts = (\n            (self.shared_contexts_ratio > 0.0)\n            and (max_input_length >= 1)\n            and (batch_size > 1)\n        )\n        batch_to_compact, compact_to_batch = None, None\n        if use_shared_contexts:\n            find_context_duplications = (\n                torch.ops.fastertransformer.find_context_duplications\n            )\n            batch_to_compact, compact_to_batch = find_context_duplications(\n                input_token_ids\n            )\n            use_shared_contexts = (\n                compact_to_batch.shape[0]\n                <= self.shared_contexts_ratio * batch_size\n            )\n\n            if not use_shared_contexts:\n                batch_to_compact, compact_to_batch = None, None\n\n        profiler.start(\"ft-context-decoder\")\n        (\n            _,\n            k_cache,\n            v_cache,\n            
last_token_hidden_states,\n        ) = self.context_decoder.forward(\n            input_embeds=input_embeds,\n            attention_mask=attention_mask,\n            input_lengths=input_lengths,\n            memory_length=memory_length,\n            batch_to_compact_index=batch_to_compact,\n            compact_index=compact_to_batch,\n        )\n        profiler.stop(\"ft-context-decoder\")\n\n        for step in range(max_input_length, max_seq_length):\n            src_indir_idx = (step - max_input_length) % 2\n            tgt_indir_idx = 1 - src_indir_idx\n\n            is_generation_done = torch.tensor(\n                [True], dtype=torch.bool, device=device\n            )\n            for ite in range(num_local_batches):\n                # The indices of the current local batch-beam.\n                bbidx = range(\n                    ite * local_batch_size * beam_width,\n                    min(\n                        (ite + 1) * local_batch_size * beam_width,\n                        batch_size * beam_width,\n                    ),\n                )\n                if cache_indirection is not None:\n                    bidx = range(\n                        ite * local_batch_size,\n                        min((ite + 1) * local_batch_size, batch_size),\n                    )\n                    src_cache_indirection = cache_indirection[\n                        src_indir_idx, bidx, ...\n                    ]\n                    tgt_cache_indirection = cache_indirection[\n                        tgt_indir_idx, bidx, ...\n                    ]\n\n                if step == max_input_length:\n                    hidden_states = last_token_hidden_states[bbidx, ...]\n                else:\n                    if comm.is_pipeline_group_first():\n                        input_embeds = self.word_embedding(\n                            output_token_ids[step - 1, bbidx]\n                        )\n                        if self.position_encoding is not None:\n  
                          position_ids = (step - 1) * torch.ones_like(\n                                pad_lengths[bbidx]\n                            )\n                            input_embeds += self.position_encoding(\n                                position_ids\n                            )\n                        if self.pre_decoder_layernorm is not None:\n                            input_embeds = self.pre_decoder_layernorm(\n                                input_embeds\n                            )\n                    else:\n                        # Dummy input_imbeds\n                        input_embeds = torch.empty(\n                            size=(len(bbidx), self.decoder.hidden_size),\n                            dtype=self.decoder.dtype,\n                            device=device,\n                        )\n\n                    profiler.start(\"ft-decoder\")\n                    hidden_states = self.decoder.forward(\n                        max_input_length=max_input_length,\n                        step=step,\n                        ite=ite,\n                        input_embeds=input_embeds,\n                        sequence_lengths=sequence_lengths[bbidx],\n                        key_cache=k_cache,\n                        value_cache=v_cache,\n                        finished=finished[bbidx],\n                        total_padding_tokens=pad_lengths[bbidx],\n                        cache_indirection=src_cache_indirection,\n                        masked_tokens=masked_tokens[bbidx, ...],\n                    )\n                    profiler.stop(\"ft-decoder\")\n\n                if comm.is_pipeline_group_last():\n                    if self.post_decoder_layernorm is not None:\n                        hidden_states = self.post_decoder_layernorm(\n                            hidden_states\n                        )\n\n                    # We use logits of fp32 type to avoid overflow issue.\n                    if 
self.use_fp32_to_compute_logit:\n                        # The FT GPT op internally uses FP32 compute type\n                        # for matrix multiplication.\n                        # This will produce the same result with the\n                        # end-to-end FT's GPT op.\n                        logits = torch.nn.functional.linear(\n                            hidden_states.float(), self.lm_head.weight\n                        )\n                    else:\n                        logits = self.lm_head(hidden_states).float()\n\n                    profiler.start(\"ft-decode\")\n                    should_stop = self.decode_op.forward(\n                        logits.view(batch_size, beam_width, -1),\n                        step,\n                        max_input_length,\n                        ite,\n                        local_batch_size,\n                        eos_token_ids,\n                        top_k,\n                        top_p,\n                        temperature,\n                        repetition_penalty,\n                        presence_penalty,\n                        min_length,\n                        len_penalty,\n                        beam_search_diversity_rate,\n                        top_p_decay,\n                        top_p_min,\n                        top_p_reset_ids,\n                        None,\n                        input_lengths,\n                        sequence_limit_lengths,\n                        stop_words_list,\n                        bad_words_list,\n                        src_cache_indirection,\n                        output_token_ids.view(-1, batch_size, beam_width),\n                        finished,\n                        sequence_lengths,\n                        cum_log_probs,\n                        output_log_probs,\n                        parent_ids,\n                        tgt_cache_indirection,\n                    )\n                    profiler.stop(\"ft-decode\")\n              
      is_generation_done &= should_stop\n\n            # Broadcast from the last pipeline node if needed.\n            profiler.start(\"ft-bcast\")\n            tensors_to_bcast = [\n                output_token_ids[step, ...],\n                finished,\n                sequence_lengths,\n                is_generation_done,\n            ]\n            if beam_width > 1:\n                tensors_to_bcast.append(tgt_cache_indirection)\n            self.decode_op.broadcast_from_last_pipeline(tensors_to_bcast)\n            profiler.stop(\"ft-bcast\")\n\n            if is_generation_done or finished.all():\n                break\n\n        # Transpose (L, batch, beam) -> (batch, beam, L)\n        output_token_ids = output_token_ids.view(\n            -1, batch_size, beam_width\n        ).permute(1, 2, 0)\n\n        # Increase sequence_length by 1 because the sequence length of time step t is t - 1. # noqa: E501\n        sequence_lengths += 1\n\n        # Outputs\n        output_dict = dict(output_token_ids=output_token_ids)\n        if return_output_length:\n            output_dict[\"output_lengths\"] = sequence_lengths\n        if return_log_probs:\n            output_dict[\"cum_log_probs\"] = cum_log_probs\n            output_dict[\"output_log_probs\"] = output_log_probs\n        return output_dict\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/huggingface_gpt_convert.py",
    "content": "# Based on https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/huggingface_gpt_convert.py # noqa: E501\n# Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\"\"\"\nConvert huggingface GPT model. Use https://huggingface.co/gpt2 as demo.\n\"\"\"\n\nimport argparse\nimport configparser\nimport os\nimport sys\n\nfrom loguru import logger\nimport numpy as np\nfrom transformers import GPT2Model  # transformers-4.10.0-py3\n\nfrom nebullvm.optional_modules.torch import torch\n\ndir_path = os.path.dirname(os.path.realpath(__file__))\nsys.path.append(dir_path + \"/../../../..\")\nsys.path.append(dir_path)\n\n\ndef get_weight_data_type(data_type):\n    if data_type == \"fp32\":\n        return np.float32\n    elif data_type == \"fp16\":\n        return np.float16\n    else:\n        assert False, f\"Invalid weight data type {data_type}\"\n\n\ndef split_and_convert_process(i, saved_dir, factor, key, args, val):\n\n    if (\n        key.find(\"input_layernorm.weight\") != -1\n        or key.find(\"input_layernorm.bias\") != -1\n        or key.find(\"attention.dense.bias\") != -1\n        or key.find(\"post_attention_layernorm.weight\") != -1\n        or key.find(\"post_attention_layernorm.bias\") != -1\n        or key.find(\"mlp.dense_4h_to_h.bias\") != -1\n        or key.find(\"final_layernorm.weight\") != -1\n        or 
key.find(\"final_layernorm.bias\") != -1\n    ):\n\n        # shared weights, only need to convert the weights of rank 0\n        if i == 0:\n            saved_path = saved_dir + \"/model.\" + key + \".bin\"\n            val.tofile(saved_path)\n\n    elif (\n        key.find(\"attention.dense.weight\") != -1\n        or key.find(\"mlp.dense_4h_to_h.weight\") != -1\n    ):\n        split_vals = np.split(val, factor, axis=0)\n        for j in range(factor):\n            saved_path = (\n                saved_dir + \"/model.\" + key + \".%d.bin\" % (i * factor + j)\n            )\n            split_vals[j].tofile(saved_path)\n\n    elif (\n        key.find(\"mlp.dense_h_to_4h.weight\") != -1\n        or key.find(\"mlp.dense_h_to_4h.bias\") != -1\n    ):\n\n        split_vals = np.split(val, factor, axis=-1)\n        for j in range(factor):\n            saved_path = (\n                saved_dir + \"/model.\" + key + \".%d.bin\" % (i * factor + j)\n            )\n            split_vals[j].tofile(saved_path)\n\n    elif key.find(\"attention.query_key_value.bias\") != -1:\n        local_dim = (int)(val.shape[-1] / 3)\n\n        val = val.reshape(3, local_dim)\n        split_vals = np.split(val, factor, axis=-1)\n\n        for j in range(factor):\n            saved_path = (\n                saved_dir + \"/model.\" + key + \".%d.bin\" % (i * factor + j)\n            )\n            split_vals[j].tofile(saved_path)\n\n    elif key.find(\"attention.query_key_value.weight\") != -1:\n        hidden_dim = val.shape[0]\n        local_dim = (int)(val.shape[-1] / 3)\n\n        val = val.reshape(hidden_dim, 3, local_dim)\n        split_vals = np.split(val, factor, axis=-1)\n\n        for j in range(factor):\n            saved_path = (\n                saved_dir + \"/model.\" + key + \".%d.bin\" % (i * factor + j)\n            )\n            split_vals[j].tofile(saved_path)\n\n    else:\n        logger.warning(\"[ERROR] cannot find key '{}'\".format(key))\n\n\ndef 
split_and_convert(args):\n    torch_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n    model = GPT2Model.from_pretrained(args.in_file).to(torch_device)\n    main(\n        args.saved_dir,\n        model,\n        args.trained_gpu_num,\n        args.infer_gpu_num,\n        args.processes,\n        args.weight_data_type,\n    )\n\n\ndef main(\n    saved_dir,\n    model: GPT2Model,\n    trained_gpu_num=1,\n    infer_gpu_num=1,\n    processes=1,\n    weight_data_type=\"fp32\",\n):\n    assert isinstance(model, GPT2Model), \"model must be GPT2Model\"\n    args = None\n    saved_dir = saved_dir + \"/%d-gpu/\" % infer_gpu_num\n\n    if not os.path.exists(saved_dir):\n        os.makedirs(saved_dir)\n    # ckpt_name = args.in_file\n\n    t_gpu_num = trained_gpu_num\n    i_gpu_num = infer_gpu_num\n    assert i_gpu_num % t_gpu_num == 0\n\n    factor = (int)(i_gpu_num / t_gpu_num)\n\n    # load position_embedding from rank 0\n    # torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'\n    # model = GPT2Model.from_pretrained(args.in_file).to(torch_device)\n\n    hf_config = vars(model.config)\n\n    # NOTE: save parameters to config files (loaded by triton backends)\n    config = configparser.ConfigParser()\n    config[\"gpt\"] = {}\n    try:\n        config[\"gpt\"][\"model_name\"] = (\n            \"gpt\"\n            if hf_config[\"_name_or_path\"] == \"\"\n            else hf_config[\"_name_or_path\"]\n        )\n        config[\"gpt\"][\"head_num\"] = str(hf_config[\"n_head\"])\n        n_embd = hf_config[\"n_embd\"]\n        config[\"gpt\"][\"size_per_head\"] = str(n_embd // hf_config[\"n_head\"])\n        config[\"gpt\"][\"inter_size\"] = str(n_embd * 4)\n        config[\"gpt\"][\"max_pos_seq_len\"] = str(hf_config[\"n_positions\"])\n        config[\"gpt\"][\"num_layer\"] = str(hf_config[\"n_layer\"])\n        config[\"gpt\"][\"vocab_size\"] = str(hf_config[\"vocab_size\"])\n        config[\"gpt\"][\"start_id\"] = str(hf_config[\"bos_token_id\"])\n 
       config[\"gpt\"][\"end_id\"] = str(hf_config[\"eos_token_id\"])\n        config[\"gpt\"][\"weight_data_type\"] = weight_data_type\n        with open(saved_dir + \"/config.ini\", \"w\") as configfile:\n            config.write(configfile)\n    except:  # noqa: E722\n        logger.warning(\"Fail to save the config in config.ini.\")\n\n    np_weight_data_type = get_weight_data_type(weight_data_type)\n\n    huggingface_model_name_pattern = [\n        \"ln_1.bias\",\n        \"ln_1.weight\",\n        \"attn.c_attn.bias\",\n        \"attn.c_attn.weight\",\n        \"attn.c_proj.bias\",\n        \"attn.c_proj.weight\",\n        \"ln_2.bias\",\n        \"ln_2.weight\",\n        \"mlp.c_fc.bias\",\n        \"mlp.c_fc.weight\",\n        \"mlp.c_proj.bias\",\n        \"mlp.c_proj.weight\",\n    ]\n\n    ft_model_name_pattern = [\n        \"input_layernorm.bias\",\n        \"input_layernorm.weight\",\n        \"attention.query_key_value.bias\",\n        \"attention.query_key_value.weight\",\n        \"attention.dense.bias\",\n        \"attention.dense.weight\",\n        \"post_attention_layernorm.bias\",\n        \"post_attention_layernorm.weight\",\n        \"mlp.dense_h_to_4h.bias\",\n        \"mlp.dense_h_to_4h.weight\",\n        \"mlp.dense_4h_to_h.bias\",\n        \"mlp.dense_4h_to_h.weight\",\n    ]\n\n    # torch.multiprocessing.set_start_method(\"spawn\")\n    # torch.multiprocessing.set_sharing_strategy(\"file_system\")\n    # pool = multiprocessing.Pool(args.processes)\n    for name, param in model.named_parameters():\n        if name.find(\"weight\") == -1 and name.find(\"bias\") == -1:\n            continue\n        if name == \"wpe.weight\":\n            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(\n                saved_dir + \"model.wpe.bin\"\n            )\n        elif name == \"wte.weight\":\n            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(\n                saved_dir + \"model.wte.bin\"\n            )\n   
     elif name == \"ln_f.bias\":\n            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(\n                saved_dir + \"model.final_layernorm.bias.bin\"\n            )\n        elif name == \"ln_f.weight\":\n            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(\n                saved_dir + \"model.final_layernorm.weight.bin\"\n            )\n        elif name == \"lm_head.weight\":\n            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(\n                saved_dir + \"model.lm_head.weight.bin\"\n            )\n        else:\n            for i in range(len(huggingface_model_name_pattern)):\n                if name.find(huggingface_model_name_pattern[i]) != -1:\n                    new_name = name.replace(\"h.\", \"layers.\").replace(\n                        huggingface_model_name_pattern[i],\n                        ft_model_name_pattern[i],\n                    )\n                    # pool.starmap(split_and_convert_process,\n                    # [(0, saved_dir, factor, new_name, args,\n                    # param.detach().cpu().numpy().astype(np_weight_data_type))],\n                    # )\n                    split_and_convert_process(\n                        0,\n                        saved_dir,\n                        factor,\n                        new_name,\n                        args,\n                        param.detach()\n                        .cpu()\n                        .numpy()\n                        .astype(np_weight_data_type),\n                    )\n\n    # pool.close()\n    # pool.join()\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\n        formatter_class=argparse.RawTextHelpFormatter\n    )\n    parser.add_argument(\n        \"-saved_dir\",\n        \"-o\",\n        type=str,\n        help=\"file name of output file\",\n        required=True,\n    )\n    parser.add_argument(\n        \"-in_file\",\n        \"-i\",\n        type=str,\n    
    help=\"file name of input checkpoint file\",\n        required=True,\n    )\n    parser.add_argument(\n        \"-trained_gpu_num\",\n        \"-t_g\",\n        type=int,\n        help=\"How many gpus for inference\",\n        default=1,\n    )\n    parser.add_argument(\n        \"-infer_gpu_num\",\n        \"-i_g\",\n        type=int,\n        help=\"How many gpus for inference\",\n        required=True,\n    )\n    parser.add_argument(\n        \"-processes\",\n        \"-p\",\n        type=int,\n        help=\"How many processes to spawn for conversion (default: 4)\",\n        default=4,\n    )\n    parser.add_argument(\n        \"-weight_data_type\", type=str, default=\"fp32\", choices=[\"fp32\", \"fp16\"]\n    )\n\n    args = parser.parse_args()\n    logger.info(\"\\n=============== Argument ===============\")\n    for key in vars(args):\n        logger.info(\"{}: {}\".format(key, vars(args)[key]))\n    logger.info(\"========================================\")\n\n    split_and_convert(args)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/intel_neural_compressor.py",
    "content": "from pathlib import Path\nfrom typing import Union\n\nfrom nebullvm.core.models import QuantizationType\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\nfrom nebullvm.operations.optimizations.compilers.quantizations.intel_neural_compressor import (  # noqa: E501\n    quantize_neural_compressor,\n)\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.optional_modules.torch import Module\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass IntelNeuralCompressorCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [\n            QuantizationType.STATIC,\n            QuantizationType.DYNAMIC,\n        ],\n        \"gpu\": [],\n    }\n\n    def __init__(self):\n        super().__init__()\n        self.model_orig = None\n\n    def execute(\n        self,\n        model: Module,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Compile the input model using IntelNeuralCompressor library.\n\n        Args:\n            model (torch.nn.Module): The pytorch model.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. 
Default: None\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n        train_input_data = input_data.get_split(\"train\")\n\n        self.model_orig = model\n\n        if quantization_type is not None:\n            quantized_model = self._quantize_model(\n                model, quantization_type, input_tfms, train_input_data\n            )\n            self.compiled_model = self._compile_model(quantized_model)\n\n    def _compile_model(self, model: Union[str, Path]):\n        return model\n\n    @staticmethod\n    def _quantize_model(\n        model: Module,\n        quantization_type: QuantizationType,\n        input_tfms: MultiStageTransformation,\n        input_data: DataManager,\n    ):\n        return quantize_neural_compressor(\n            model, quantization_type, input_tfms, input_data\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/onnxruntime.py",
    "content": "from pathlib import Path\nfrom typing import Union, List, Tuple\n\nimport numpy as np\n\nfrom nebullvm.config import QUANTIZATION_DATA_NUM\nfrom nebullvm.core.models import QuantizationType\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\n\nfrom nebullvm.operations.optimizations.compilers.quantizations.onnx import (\n    quantize_onnx,\n)\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass ONNXCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [\n            None,\n            QuantizationType.STATIC,\n            QuantizationType.DYNAMIC,\n        ],\n        \"gpu\": [\n            None,\n            QuantizationType.HALF,\n        ],\n    }\n\n    def execute(\n        self,\n        model: str,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Compile the input model using ONNX Runtime Compiler.\n\n        Args:\n            model (str): The onnx model path.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. 
Default: None\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n        train_input_data = input_data.get_split(\"train\").get_numpy_list(\n            QUANTIZATION_DATA_NUM\n        )\n\n        if quantization_type is not None:\n            model = self._quantize_model(\n                model, train_input_data, quantization_type, input_tfms\n            )\n\n        self.compiled_model = self._compile_model(model)\n\n    def _compile_model(self, model: Union[str, Path]):\n        return model\n\n    def _quantize_model(\n        self,\n        model_path: str,\n        input_data: List[Tuple[np.ndarray, ...]],\n        quantization_type: QuantizationType,\n        input_tfms: MultiStageTransformation,\n    ):\n        return quantize_onnx(\n            model_path, input_data, quantization_type, self.device, input_tfms\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/openvino.py",
    "content": "import subprocess\nfrom pathlib import Path\nfrom typing import Tuple, List, Union\n\nimport numpy as np\n\nfrom nebullvm.config import QUANTIZATION_DATA_NUM\nfrom nebullvm.core.models import QuantizationType, ModelParams\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\nfrom nebullvm.operations.optimizations.compilers.quantizations.openvino import (  # noqa: E501\n    quantize_openvino,\n)\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.optional_modules.openvino import (\n    Core,\n    CompiledModel,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.onnx import get_input_names\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass OpenVINOCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [\n            None,\n            QuantizationType.STATIC,\n            QuantizationType.HALF,\n        ],\n        \"gpu\": [],\n    }\n\n    def __init__(self):\n        super().__init__()\n\n    def execute(\n        self,\n        model: Union[str, Path],\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Compile the input model using OpenVINO library.\n\n        Args:\n            model (str): The onnx model path.\n            model_params (ModelParams): The model parameters.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. 
Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. Default: None\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n        train_input_data = input_data.get_split(\"train\").get_numpy_list(\n            QUANTIZATION_DATA_NUM\n        )\n\n        cmd = [\n            \"mo\",\n            \"--input_model\",\n            str(model),\n            \"--output_dir\",\n            str(Path(model).parent),\n            \"--input\",\n            \",\".join(get_input_names(str(model))),\n            \"--input_shape\",\n            \",\".join([f\"{list(shape)}\" for shape in model_params.input_sizes]),\n        ]\n\n        if quantization_type is QuantizationType.DYNAMIC:\n            return None\n\n        if quantization_type is QuantizationType.HALF:\n            cmd = cmd + [\"--compress_to_fp16\"]\n\n        process = subprocess.Popen(cmd)\n        process.wait()\n        base_path = Path(model).parent\n        openvino_model_path = base_path / f\"{Path(model).stem}.xml\"\n        openvino_model_weights = base_path / f\"{Path(model).stem}.bin\"\n\n        if quantization_type not in [QuantizationType.HALF, None]:\n            openvino_model_path, openvino_model_weights = self._quantize_model(\n                model_topology=str(openvino_model_path),\n                model_weights=str(openvino_model_weights),\n                
input_names=get_input_names(str(model)),\n                input_data=train_input_data,\n            )\n\n        self.compiled_model = str(\n            Path(openvino_model_path).parent / Path(openvino_model_path).stem\n        )\n\n    def _compile_model(\n        self,\n        model_name: str,\n        model_weights: str,\n        network_parameters: ModelParams,\n    ) -> CompiledModel:\n        core = Core()\n        model = core.read_model(model=model_name, weights=model_weights)\n\n        dynamic_shape = self._get_dynamic_shape(model, network_parameters)\n\n        if dynamic_shape is not None:\n            model.reshape(dynamic_shape)\n\n        return core.compile_model(model=model, device_name=\"CPU\")\n\n    @staticmethod\n    def _quantize_model(\n        model_topology: str,\n        model_weights: str,\n        input_data: List[Tuple[np.ndarray, ...]],\n        input_names: List[str],\n    ) -> Tuple[str, str]:\n        return quantize_openvino(\n            model_topology, model_weights, input_data, input_names\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/intel_neural_compressor.py",
    "content": "from pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Any\n\nimport yaml\n\nfrom nebullvm.core.models import QuantizationType\nfrom nebullvm.optional_modules.neural_compressor import (\n    MixedPrecision,\n    Quantization,\n)\nfrom nebullvm.optional_modules.torch import DataLoader, Module, GraphModule\nfrom nebullvm.tools.data import DataManager, PytorchDataset\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    HalfPrecisionTransformation,\n)\n\n\ndef _prepare_quantization_config(model: Any, tmp_dir: str, approach: str):\n    config = {\n        \"model\": {\n            \"name\": model.__class__.__name__,\n            \"framework\": \"pytorch_fx\",\n        },\n        \"quantization\": {\"approach\": approach},\n        \"evaluation\": {\"accuracy\": {\"metric\": {\"topk\": 1}}},\n        \"tuning\": {\n            \"accuracy_criterion\": {\"relative\": 0.01},\n        },\n    }\n\n    path_file = Path(tmp_dir) / \"temp_qt.yaml\"\n    with open(path_file, \"w\") as f:\n        yaml.dump(config, f)\n\n    return path_file\n\n\ndef _prepare_mixed_precision_config(model: Any, tmp_dir: str):\n    config = {\n        \"model\": {\n            \"name\": model.__class__.__name__,\n            \"framework\": \"pytorch_fx\",\n        },\n        \"mixed_precision\": {\"precisions\": \"bf16\"},\n        \"evaluation\": {\"accuracy\": {\"metric\": {\"topk\": 1}}},\n        \"tuning\": {\n            \"accuracy_criterion\": {\"relative\": 0.01},\n        },\n    }\n\n    path_file = Path(tmp_dir) / \"temp_mp.yaml\"\n    with open(path_file, \"w\") as f:\n        yaml.dump(config, f)\n\n    return path_file\n\n\ndef _get_dataloader(input_data: DataManager):\n    bs = input_data[0][0][0].shape[0]\n    ds = PytorchDataset(input_data, has_labels=True)\n    dl = DataLoader(ds, bs)\n    return dl\n\n\ndef _quantize_static(model: Module, input_data: DataManager) -> GraphModule:\n    with 
TemporaryDirectory() as tmp_dir:\n        config_file_qt = _prepare_quantization_config(\n            model, tmp_dir, \"post_training_static_quant\"\n        )\n        quantizer = Quantization(str(config_file_qt))\n        quantizer.model = model\n        quantizer.calib_dataloader = _get_dataloader(input_data)\n        quantizer.eval_dataloader = _get_dataloader(input_data)\n        compressed_model = quantizer()\n\n    return compressed_model\n\n\ndef _quantize_dynamic(model: Module) -> GraphModule:\n    with TemporaryDirectory() as tmp_dir:\n        config_file_qt = _prepare_quantization_config(\n            model, tmp_dir, \"post_training_dynamic_quant\"\n        )\n        quantizer = Quantization(str(config_file_qt))\n        quantizer.model = model\n        compressed_model = quantizer()\n\n    return compressed_model\n\n\ndef _mixed_precision(\n    model: Module, input_tfms: MultiStageTransformation\n) -> GraphModule:\n    with TemporaryDirectory() as tmp_dir:\n        config_file_qt = _prepare_mixed_precision_config(model, tmp_dir)\n        converter = MixedPrecision(str(config_file_qt))\n        converter.model = model\n        compressed_model = converter()\n\n    input_tfms.append(HalfPrecisionTransformation())\n\n    return compressed_model\n\n\ndef quantize_neural_compressor(\n    model: Module,\n    quantization_type: QuantizationType,\n    input_tfms: MultiStageTransformation,\n    input_data: DataManager,\n) -> GraphModule:\n    if quantization_type is QuantizationType.STATIC:\n        quantized_model = _quantize_static(model, input_data)\n    elif quantization_type is QuantizationType.DYNAMIC:\n        quantized_model = _quantize_dynamic(model)\n    elif quantization_type is QuantizationType.HALF:\n        quantized_model = _mixed_precision(model, input_tfms)\n    else:\n        raise ValueError(\n            f\"Quantization type {quantization_type} is not \"\n            f\"supported by Intel Neural Compressor\"\n        )\n    return 
quantized_model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/onnx.py",
    "content": "from pathlib import Path\nfrom typing import Union, Iterable, Tuple, List\n\nimport cpuinfo\nimport numpy as np\n\nfrom nebullvm.core.models import QuantizationType, Device, DeviceType\nfrom nebullvm.optional_modules.onnx import (\n    onnx,\n    convert_float_to_float16_model_path,\n)\nfrom nebullvm.optional_modules.onnxruntime import (\n    CalibrationDataReader,\n    QuantType,\n    quantize_dynamic,\n    quantize_static,\n)\nfrom nebullvm.optional_modules.torch import DataLoader\nfrom nebullvm.tools.onnx import get_input_names\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    HalfPrecisionTransformation,\n)\n\n\nclass _IterableCalibrationDataReader(CalibrationDataReader):\n    def __init__(\n        self,\n        iterable_dataset: Union[Iterable[Tuple], List[Tuple]],\n        input_names: List[str],\n    ):\n        self.iterable_dataset = iter(\n            [\n                {\n                    input_name: value\n                    for inputs in iterable_dataset\n                    for input_name, value in zip(input_names, inputs)\n                }\n            ]\n        )\n\n    def get_next(self) -> dict:\n        return next(self.iterable_dataset, None)\n\n    @classmethod\n    def from_dataloader(\n        cls, dl: DataLoader, input_names: List[str], contains_y: bool = True\n    ):\n        iterable_ds = iter(\n            inputs[:-1] if contains_y else inputs for inputs in dl\n        )\n        return cls(iterable_ds, input_names)\n\n\ndef _quantize_dynamic(model_path: str) -> str:\n    model_path = Path(model_path)\n    model_quant = model_path.parent.parent / \"int8_dynamic\"\n    model_quant.mkdir(parents=True, exist_ok=True)\n    model_quant = model_quant / (model_path.stem + \".quant.onnx\")\n    quantize_dynamic(\n        model_path,\n        model_quant,\n        weight_type=QuantType.QUInt8,\n        optimize_model=False,\n    )\n    return str(model_quant)\n\n\ndef 
_get_quantization_type_for_static(use_gpu) -> Tuple[QuantType, QuantType]:\n    \"\"\"Returns the quantization types for activations and weights,\n    depending on the underlying hardware\n    \"\"\"\n    arch = cpuinfo.get_cpu_info()[\"arch\"].lower()\n    if use_gpu:\n        activation_type = weight_type = QuantType.QInt8\n    elif \"x86\" in arch:\n        cpu_raw_data = cpuinfo.get_cpu_info()[\"brand_raw\"].lower()\n        if \"intel\" in cpu_raw_data and \"xeon\" in cpu_raw_data:\n            activation_type = QuantType.QUInt8\n            weight_type = QuantType.QInt8\n        else:\n            activation_type = weight_type = QuantType.QUInt8\n    else:\n        activation_type = QuantType.QUInt8\n        weight_type = QuantType.QUInt8\n    return activation_type, weight_type\n\n\ndef _quantize_static(\n    model_path: str, input_data: List[Tuple[np.ndarray, ...]], use_gpu: bool\n) -> str:\n    model_path = Path(model_path)\n    model_quant = model_path.parent.parent / \"int8_static\"\n    model_quant.mkdir(parents=True, exist_ok=True)\n    model_quant = model_quant / (model_path.stem + \".quant.onnx\")\n    inputs = input_data\n    input_names = get_input_names(str(model_path))\n    cdr = _IterableCalibrationDataReader(\n        input_names=input_names, iterable_dataset=inputs\n    )\n    activation_type, weight_type = _get_quantization_type_for_static(use_gpu)\n    quantize_static(\n        model_path,\n        Path(model_quant),\n        cdr,\n        activation_type=activation_type,\n        weight_type=weight_type,\n        optimize_model=False,\n    )\n    return str(model_quant)\n\n\ndef _convert_to_half_precision(\n    model_path: str, input_tfms: MultiStageTransformation\n) -> str:\n    model_path = Path(model_path)\n    model_quant = model_path.parent.parent / \"fp16\"\n    model_quant.mkdir(parents=True)\n    model_quant = model_quant / (model_path.stem + \"_fp16.onnx\")\n    new_onnx_model = 
convert_float_to_float16_model_path(str(model_path))\n    input_tfms.append(HalfPrecisionTransformation())\n    try:\n        onnx.save(new_onnx_model, str(model_quant))\n    except ValueError:\n        # Model larger than 2GB must be saved as external data\n        onnx.save(\n            new_onnx_model,\n            str(model_quant),\n            save_as_external_data=True,\n            all_tensors_to_one_file=False,\n            convert_attribute=True,\n        )\n    return str(model_quant)\n\n\ndef quantize_onnx(\n    model_path: str,\n    input_data: List[Tuple[np.ndarray, ...]],\n    quantization_type: QuantizationType,\n    device: Device,\n    input_tfms: MultiStageTransformation,\n) -> str:\n    if quantization_type == QuantizationType.DYNAMIC:\n        return _quantize_dynamic(model_path)\n    elif quantization_type == QuantizationType.STATIC:\n        return _quantize_static(\n            model_path, input_data, device.type is DeviceType.GPU\n        )\n    elif quantization_type == QuantizationType.HALF:\n        return _convert_to_half_precision(model_path, input_tfms)\n    else:\n        raise ValueError(\n            f\"Quantization type {quantization_type} not supported\"\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/openvino.py",
    "content": "from typing import List, Tuple, Any\n\nimport numpy as np\n\nfrom nebullvm.optional_modules.openvino import (\n    DataLoader,\n    load_model,\n    IEEngine,\n    create_pipeline,\n    compress_model_weights,\n    save_model,\n)\n\n\nclass _CalibrationDataLoader(DataLoader):\n    def __init__(\n        self, input_data: List[Tuple[Any, ...]], input_names: List[str]\n    ):\n        self._input_data = input_data\n        self._input_names = input_names\n\n    def __len__(self):\n        return len(self._input_data)\n\n    def __getitem__(self, item):\n        inputs = {\n            k: v for (k, v) in zip(self._input_names, self._input_data[item])\n        }\n        return (\n            (item, None),\n            inputs,\n        )\n\n\ndef quantize_openvino(\n    model_topology: str,\n    model_weights: str,\n    input_data: List[Tuple[np.ndarray, ...]],\n    input_names: List[str],\n) -> Tuple[str, str]:\n    model_config = {\n        \"model_name\": \"model\",\n        \"model\": model_topology,\n        \"weights\": model_weights,\n    }\n\n    # Engine config\n    engine_config = {\"device\": \"CPU\"}\n\n    algorithms = [\n        {\n            \"name\": \"DefaultQuantization\",\n            \"params\": {\n                \"target_device\": \"ANY\",\n                \"preset\": \"performance\",\n                \"stat_subset_size\": len(input_data),\n            },\n        }\n    ]\n    data_loader = _CalibrationDataLoader(\n        input_data=input_data, input_names=input_names\n    )\n    model = load_model(model_config=model_config)\n    engine = IEEngine(config=engine_config, data_loader=data_loader)\n    pipeline = create_pipeline(algorithms, engine)\n    compressed_model = pipeline.run(model=model)\n    compress_model_weights(compressed_model)\n    compressed_model_paths = save_model(\n        model=compressed_model,\n        save_path=\"quantized_model\",\n        model_name=\"quantized_model\",\n    )\n\n    return (\n        
compressed_model_paths[0][\"model\"],\n        compressed_model_paths[0][\"weights\"],\n    )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/pytorch.py",
    "content": "import copy\nfrom typing import List, Tuple, Union\n\nfrom loguru import logger\n\nfrom nebullvm.core.models import DeviceType, Device, QuantizationType\nfrom nebullvm.optional_modules.torch import (\n    torch,\n    Module,\n    symbolic_trace,\n    QuantStub,\n    DeQuantStub,\n    GraphModule,\n    default_dynamic_qconfig,\n    prepare_fx,\n    convert_fx,\n    ScriptModule,\n)\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    HalfPrecisionTransformation,\n)\nfrom nebullvm.tools.utils import check_module_version\n\n\nclass _QuantWrapper(Module):\n    def __init__(self, model: Module):\n        super(_QuantWrapper, self).__init__()\n        qconfig = model.qconfig if hasattr(model, \"qconfig\") else None\n        self.quant = QuantStub(qconfig)\n        self.model = model\n        self.dequant = DeQuantStub()\n\n    def forward(self, *inputs: torch.Tensor):\n        inputs = (self.quant(x) for x in inputs)\n        outputs = self.model(*inputs)\n        return tuple(self.dequant(x) for x in outputs)\n\n\ndef _quantize_dynamic_torch(model: Module):\n    layer_types = {\n        type(layer)\n        for layer in model.children()\n        if len(list(layer.parameters())) > 0\n    }\n    return torch.quantization.quantize_dynamic(\n        model=model, qconfig_spec=layer_types, dtype=torch.qint8\n    )\n\n\ndef _quantize_dynamic_torch_fx(\n    model: GraphModule,\n    input_data: List[Tuple[torch.Tensor, ...]],\n):\n    qconfig_dict = {\"\": default_dynamic_qconfig}\n\n    additional_arguments = {}\n    if check_module_version(torch, min_version=\"1.13.0\"):\n        additional_arguments[\"example_inputs\"] = input_data[0]\n\n    model_prepared = prepare_fx(model, qconfig_dict, **additional_arguments)\n    return convert_fx(model_prepared)\n\n\ndef _quantize_static_torch(\n    model: Module,\n    input_data: List[Tuple[torch.Tensor, ...]],\n    backend: str,\n):\n    model = _QuantWrapper(model)\n    model.qconfig = 
torch.quantization.get_default_qconfig(backend)\n    # TODO: change line below, it's wrong\n    # model = torch.quantization.fuse_modules(model, [[\"conv\", \"relu\"]])\n    model = torch.quantization.prepare(model)\n    with torch.no_grad():\n        for tensors in input_data:\n            _ = model(*tensors)\n    return torch.quantization.convert(model)\n\n\ndef _quantize_static_torch_fx(\n    model: GraphModule,\n    input_data: List[Tuple[torch.Tensor, ...]],\n    backend: str,\n):\n    qconfig_dict = {\"\": torch.quantization.get_default_qconfig(backend)}\n    additional_arguments = {}\n    if check_module_version(torch, min_version=\"1.13.0\"):\n        additional_arguments[\"example_inputs\"] = input_data[0]\n\n    model_prepared = prepare_fx(model, qconfig_dict, **additional_arguments)\n    with torch.no_grad():\n        for tensors in input_data:\n            _ = model_prepared(*tensors)\n    return convert_fx(model_prepared)\n\n\ndef _quantize_static(\n    model: Union[Module, GraphModule],\n    input_data: List[Tuple[torch.Tensor, ...]],\n    device: Device,\n):\n    assert (\n        device is not DeviceType.GPU\n    ), \"Quantization for torch is only available on CPU\"\n\n    backend = (\n        \"fbgemm\"\n        if \"fbgemm\" in torch.backends.quantized.supported_engines\n        else \"qnnpack\"\n    )\n\n    torch.backends.quantized.engine = backend\n\n    if isinstance(model, GraphModule):\n        return _quantize_static_torch_fx(model, input_data, backend)\n    else:\n        return _quantize_static_torch(model, input_data, backend)\n\n\ndef _quantize_dynamic(\n    model: Union[Module, GraphModule],\n    input_data: List[Tuple[torch.Tensor, ...]],\n    device: Device,\n):\n    assert (\n        device is not DeviceType.GPU\n    ), \"Quantization for torch is only available on CPU\"\n\n    backend = (\n        \"fbgemm\"\n        if \"fbgemm\" in torch.backends.quantized.supported_engines\n        else \"qnnpack\"\n    )\n\n    
torch.backends.quantized.engine = backend\n\n    if isinstance(model, GraphModule):\n        return _quantize_dynamic_torch_fx(model, input_data)\n    else:\n        return _quantize_dynamic_torch(model)\n\n\ndef _half_precision(model: Module):\n    return model.half()\n\n\ndef quantize_pytorch(\n    model: Module,\n    quantization_type: QuantizationType,\n    input_tfms: MultiStageTransformation,\n    input_data_torch: List[Tuple[torch.Tensor, ...]],\n    device: Device,\n) -> Union[torch.nn.Module, ScriptModule, GraphModule]:\n    model = copy.deepcopy(model).eval()\n\n    try:\n        model = symbolic_trace(model)\n    except Exception:\n        logger.warning(\"Unable to trace model with torch.fx\")\n\n    if quantization_type is QuantizationType.HALF:\n        input_tfms.append(HalfPrecisionTransformation())\n        quantized_model = _half_precision(model)\n    elif quantization_type is QuantizationType.STATIC:\n        quantized_model = _quantize_static(model, input_data_torch, device)\n    elif quantization_type is QuantizationType.DYNAMIC:\n        quantized_model = _quantize_dynamic(model, input_data_torch, device)\n    else:\n        raise NotImplementedError(\n            f\"No quantization implemented for quantization \"\n            f\"type {quantization_type}\"\n        )\n\n    return quantized_model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tensor_rt.py",
    "content": "from typing import List, Tuple\n\nimport numpy as np\n\nfrom nebullvm.core.models import QuantizationType, ModelParams\nfrom nebullvm.optional_modules.tensor_rt import (\n    tensorrt as trt,\n    IInt8EntropyCalibrator2,\n    polygraphy,\n)\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n)\n\n\ndef quantize_tensorrt(\n    quantization_type: QuantizationType,\n    model_params: ModelParams,\n    config,\n    input_tfms: MultiStageTransformation,\n    input_data: List[Tuple[np.ndarray, ...]] = None,\n):\n    if quantization_type is QuantizationType.HALF:\n        config.set_flag(trt.BuilderFlag.FP16)\n        # Tensor RT does not need to transform input data\n        # to fp16 because it expects always fp32\n    elif quantization_type is QuantizationType.STATIC:\n        assert input_data is not None, (\n            \"You need to specify the calibration data for \"\n            \"performing static quantization.\"\n        )\n        calibrator = TensorRTCalibrator(\n            batch_size=model_params.batch_size,\n            input_data=input_data,\n        )\n        config.set_flag(trt.BuilderFlag.INT8)\n        config.int8_calibrator = calibrator\n\n    return config\n\n\nclass TensorRTCalibrator(IInt8EntropyCalibrator2):\n    def __init__(\n        self, batch_size: int, input_data: List[Tuple[np.ndarray, ...]]\n    ):\n        super(TensorRTCalibrator, self).__init__()\n        self._bs = batch_size\n        self.batches = (x for x in input_data)\n\n    def get_batch(self, names):\n        cuda_stream = polygraphy.Stream()\n        try:\n            data = next(self.batches)\n\n            cuda_data = []\n            for input_tensor in data:\n                device_array = polygraphy.DeviceArray(\n                    shape=input_tensor.shape, dtype=input_tensor.dtype\n                )\n                device_array.copy_from(\n                    host_buffer=input_tensor, stream=cuda_stream\n                )\n     
           cuda_data.append(device_array)\n\n            return [input_tensor.ptr for input_tensor in cuda_data]\n        except StopIteration:\n            return None\n\n    def get_batch_size(self):\n        return self._bs\n\n    def read_calibration_cache(self):\n        return None\n\n    def write_calibration_cache(self, cache):\n        return None\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tensorflow.py",
    "content": "from typing import List, Tuple\n\nfrom nebullvm.core.models import QuantizationType\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\n\n\ndef _quantize_dynamic(model: tf.Module):\n    converter = tf.lite.TFLiteConverter.from_keras_model(model)\n    converter.optimizations = [tf.lite.Optimize.DEFAULT]\n    tflite_quant_model = converter.convert()\n    return tflite_quant_model\n\n\ndef _quantize_static(model: tf.Module, dataset: List[Tuple[tf.Tensor, ...]]):\n    def representative_dataset():\n        for data_tuple in dataset:\n            yield list(data_tuple)\n\n    converter = tf.lite.TFLiteConverter.from_keras_model(model)\n    converter.optimizations = [tf.lite.Optimize.DEFAULT]\n    converter.representative_dataset = representative_dataset\n    tflite_quant_model = converter.convert()\n    return tflite_quant_model\n\n\ndef _half_precision(model: tf.Module):\n    converter = tf.lite.TFLiteConverter.from_keras_model(model)\n    converter.optimizations = [tf.lite.Optimize.DEFAULT]\n    converter.target_spec.supported_types = [tf.float16]\n    tflite_quant_model = converter.convert()\n    return tflite_quant_model\n\n\ndef quantize_tensorflow(\n    model: tf.Module,\n    quantization_type: QuantizationType,\n    input_data_tensorflow: List[Tuple[tf.Tensor, ...]],\n):\n    if quantization_type is QuantizationType.DYNAMIC:\n        quantized_model = _quantize_dynamic(model)\n    elif quantization_type is QuantizationType.STATIC:\n        quantized_model = _quantize_static(model, input_data_tensorflow)\n    elif quantization_type is QuantizationType.HALF:\n        quantized_model = _half_precision(model)\n    else:\n        raise NotImplementedError(\n            f\"Quantization not supported for type {quantization_type}\"\n        )\n\n    return quantized_model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tvm.py",
    "content": "from typing import List, Sequence, Any\n\nfrom nebullvm.config import QUANTIZATION_DATA_NUM\nfrom nebullvm.core.models import QuantizationType\nfrom nebullvm.optional_modules.tvm import (\n    relay,\n    ToMixedPrecision,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    HalfPrecisionTransformation,\n)\n\n\nclass TVMCalibrator(DataManager):\n    def __init__(self, data_reader: Sequence, input_names: List[str]):\n        super(TVMCalibrator, self).__init__(data_reader=data_reader)\n        self._input_names = input_names\n\n    def __getitem__(self, item: int):\n        tuple_ = self._data_reader[item]\n        return {name: data for name, data in zip(self._input_names, tuple_)}\n\n\ndef quantize_apache_tvm(\n    model: Any,\n    quantization_type: QuantizationType,\n    input_tfms: MultiStageTransformation,\n    input_data: DataManager,\n    params: Any,\n):\n    if quantization_type is not None:\n        if quantization_type is QuantizationType.HALF:\n            quantized_model = ToMixedPrecision(mixed_precision_type=\"float16\")(\n                model\n            )\n            input_tfms.append(HalfPrecisionTransformation())\n        else:\n            if quantization_type is QuantizationType.DYNAMIC:\n                inputs = None\n            elif quantization_type is QuantizationType.STATIC:\n                inputs = input_data.get_split(\"train\").get_numpy_list(\n                    QUANTIZATION_DATA_NUM\n                )\n                input_names = [f\"input_{n}\" for n in range(len(inputs[0]))]\n                inputs = TVMCalibrator(inputs, input_names)\n            else:\n                return\n\n            if inputs is not None:\n                with relay.quantize.qconfig(\n                    calibrate_mode=\"kl_divergence\", weight_scale=\"max\"\n                ):\n                    quantized_model = relay.quantize.quantize(\n                 
       model, params, dataset=inputs\n                    )\n            else:\n                with relay.quantize.qconfig(\n                    calibrate_mode=\"global_scale\", global_scale=8.0\n                ):\n                    quantized_model = relay.quantize.quantize(model, params)\n\n        return quantized_model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/utils.py",
    "content": "from loguru import logger\n\nfrom nebullvm.core.models import QuantizationType\n\n\ndef check_quantization(\n    quantization_type: QuantizationType, perf_loss_ths: float\n):\n    if quantization_type is not None and perf_loss_ths is None:\n        logger.warning(\n            \"Got a valid quantization type without any given quantization \"\n            \"threshold. The quantization step will be ignored.\"\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/tensor_rt.py",
    "content": "import abc\nimport copy\nimport os\nimport subprocess\nfrom pathlib import Path\nfrom typing import List, Any, Tuple\n\nimport numpy as np\n\nfrom nebullvm.config import QUANTIZATION_DATA_NUM, TORCH_TENSORRT_PRECISIONS\nfrom nebullvm.core.models import QuantizationType, ModelParams\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\n\nfrom nebullvm.operations.optimizations.compilers.quantizations.tensor_rt import (  # noqa: E501\n    quantize_tensorrt,\n)\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.optional_modules.onnx import onnx\nfrom nebullvm.optional_modules.tensor_rt import tensorrt as trt\nfrom nebullvm.optional_modules.torch import torch, Module\nfrom nebullvm.optional_modules.torch_tensorrt import (\n    torch_tensorrt,\n    DataLoaderCalibrator,\n)\nfrom nebullvm.tools.data import DataManager, PytorchDataset\nfrom nebullvm.tools.diffusers import UNet\nfrom nebullvm.tools.onnx import get_input_names\nfrom nebullvm.tools.transformations import (\n    MultiStageTransformation,\n    HalfPrecisionTransformation,\n)\n\n\nclass TensorRTCompiler(Compiler, abc.ABC):\n    supported_ops = {\n        \"cpu\": [],\n        \"gpu\": [\n            None,\n            QuantizationType.STATIC,\n            QuantizationType.HALF,\n        ],\n    }\n\n    def __init__(self):\n        super().__init__()\n        self.model_orig = None\n\n    @staticmethod\n    def _extract_dynamic_shape_ranges(model_params: ModelParams):\n        inputs_shapes = []\n\n        for i, info in enumerate(model_params.input_infos):\n            static_shape = info.size\n\n            if model_params.dynamic_info is not None:\n                input_dict = model_params.dynamic_info.inputs[i]\n\n                assert all(\n                    key in dim\n                    for dim in input_dict.values()\n                    for key in [\"min_val\", \"opt_val\", \"max_val\"]\n          
      ), (\n                    \"Missing min/opt/max ranges, TensorRT needs them to \"\n                    \"enable dynamic shape properly\"\n                )\n\n                shape_dict = {\n                    \"min_shape\": [\n                        static_shape[j]\n                        if j not in input_dict\n                        else input_dict[j][\"min_val\"]\n                        for j in range(len(static_shape))\n                    ],\n                    \"opt_shape\": [\n                        static_shape[j]\n                        if j not in input_dict\n                        else input_dict[j][\"opt_val\"]\n                        for j in range(len(static_shape))\n                    ],\n                    \"max_shape\": [\n                        static_shape[j]\n                        if j not in input_dict\n                        else input_dict[j][\"max_val\"]\n                        for j in range(len(static_shape))\n                    ],\n                }\n                inputs_shapes.append(shape_dict)\n            else:\n                inputs_shapes.append({\"shape\": static_shape})\n\n        return inputs_shapes\n\n    @abc.abstractmethod\n    def execute(self, *args, **kwargs):\n        pass\n\n\nclass PyTorchTensorRTCompiler(TensorRTCompiler):\n    def execute(\n        self,\n        model: Module,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Compile the input model using TensorRT Compiler from the\n            PyTorch interface.\n\n        Args:\n            model (torch.nn.Module): The pytorch model.\n            model_params (ModelParams): The model parameters.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input 
tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. Default: None\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n\n        if quantization_type is QuantizationType.HALF:\n            dtype = torch.half\n            input_tfms.append(HalfPrecisionTransformation())\n        elif quantization_type is QuantizationType.STATIC:\n            if model_params.dynamic_info is not None:\n                self.logger.warning(\n                    \"Static quantization is not available when \"\n                    \"using dynamic shape\"\n                )\n                return\n            dtype = torch.int8\n\n            dataset = PytorchDataset(input_data.get_split(\"train\"))\n            dataloader = torch.utils.data.DataLoader(\n                dataset,\n                batch_size=dataset.batch_size,\n                shuffle=False,\n                num_workers=0,\n            )\n\n            calibrator = torch_tensorrt.ptq.DataLoaderCalibrator(\n                dataloader,\n                use_cache=False,\n                
algo_type=torch_tensorrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2,  # noqa E501\n                device=torch.device(self.device.to_torch_format()),\n            )\n        else:\n            dtype = torch.float32\n\n        # Convert int64 to int32 for transformers inputs\n        input_tensors = [\n            tensor.to(self.device.to_torch_format())\n            if tensor.dtype != torch.int64\n            else tensor.to(torch.int32).to(self.device.to_torch_format())\n            for tensor in input_data.get_list(1)[0]\n        ]\n\n        self.compiled_model = self._compile_model(\n            model=model,\n            model_params=model_params,\n            input_tensors=input_tensors,\n            dtype=dtype,\n            calibrator=calibrator\n            if quantization_type is QuantizationType.STATIC\n            else None,  # noqa E501\n            quantization_type=quantization_type,\n        )\n\n    @torch.no_grad()\n    def _compile_model(\n        self,\n        model: Module,\n        model_params: ModelParams,\n        input_tensors: List[torch.Tensor],\n        dtype: torch.dtype,\n        calibrator: DataLoaderCalibrator,\n        quantization_type: QuantizationType,\n    ):\n\n        model.to(self.device.to_torch_format()).eval()\n\n        try:\n            if quantization_type is QuantizationType.HALF:\n                ts_model = torch.jit.script(copy.deepcopy(model).half()).half()\n            else:\n                ts_model = torch.jit.script(model)\n        except Exception:\n            if quantization_type is QuantizationType.HALF:\n                ts_model = torch.jit.trace(\n                    copy.deepcopy(model).half(),\n                    [t.half() for t in input_tensors],\n                ).half()\n            else:\n                ts_model = torch.jit.trace(model, input_tensors)\n\n        with torch_tensorrt.logging.errors():\n            inputs_shapes = self._extract_dynamic_shape_ranges(model_params)\n            trt_model 
= torch_tensorrt.compile(\n                ts_model,\n                inputs=[\n                    torch_tensorrt.Input(\n                        **inputs_shapes[i],\n                        dtype=torch.half\n                        if (\n                            dtype == torch.half\n                            and tensor.dtype not in [torch.int8, torch.int32]\n                        )\n                        else tensor.dtype,\n                    )\n                    for i, tensor in enumerate(input_tensors)\n                ],\n                enabled_precisions=TORCH_TENSORRT_PRECISIONS[str(dtype)],\n                calibrator=calibrator\n                if quantization_type is QuantizationType.STATIC\n                else None,\n                workspace_size=self.device.get_free_memory(),\n                device={\n                    \"device_type\": torch_tensorrt.DeviceType.GPU,\n                    \"gpu_id\": self.device.idx,\n                    \"dla_core\": 0,\n                    \"allow_gpu_fallback\": False,\n                    \"disable_tf32\": False,\n                },\n                truncate_long_and_double=True,\n            )\n\n        # Delete calibration cache\n        if os.path.exists(\"calibration.cache\"):\n            os.remove(\"calibration.cache\")\n\n        return trt_model\n\n    @staticmethod\n    def _quantize_model(**kwargs) -> Any:\n        raise NotImplementedError\n\n\nclass ONNXTensorRTCompiler(TensorRTCompiler):\n    def __init__(self):\n        super().__init__()\n        self.model_orig = None\n        self.onnx_model_path = None\n        self.simplify_model = True\n\n    def execute(\n        self,\n        model: str,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        is_diffusion: bool = False,\n        **kwargs,\n    ):\n      
  \"\"\"Compile the input model using TensorRT Compiler from the\n            ONNX interface.\n\n        Args:\n            model (str): The path to the onnx model.\n            model_params (ModelParams): The model parameters.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. Default: None\n            is_diffusion (bool): Whether the model is a diffusion model.\n                Default: False.\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n        train_input_data = input_data.get_split(\"train\").get_numpy_list(\n            QUANTIZATION_DATA_NUM\n        )\n\n        if self.simplify_model and not is_diffusion:\n            try:\n                import onnxsim  # noqa: F401\n\n                # Simplify model, otherwise tensor RT won't work\n                # on gpt2 and some other models.\n                simplified_model = str(model) + \"_simplified\"\n                if not Path(simplified_model).is_file():\n              
      cmd = [\n                        \"onnxsim\",\n                        str(model),\n                        simplified_model,\n                    ]\n                    subprocess.run(cmd, stdout=subprocess.DEVNULL)\n\n                # First try with simplified model\n                self.onnx_model_path = simplified_model\n                assert os.path.isfile(self.onnx_model_path)\n            except Exception:\n                # Use original model\n                self.logger.warning(\n                    \"Unable to simplify model with ONNX Simplifier. \"\n                    \"Original ONNX model will be used to build \"\n                    \"TensorRT engine\"\n                )\n                self.onnx_model_path = str(model)\n            self.simplify_model = False\n        elif self.onnx_model_path is None:\n            self.onnx_model_path = str(model)\n\n        if is_diffusion:\n            if quantization_type is None:\n                self.logger.warning(\n                    \"Skipping float32 precision for Stable Diffusion, \"\n                    \"half precision will be used instead.\"\n                )\n                return\n            if quantization_type is QuantizationType.STATIC:\n                self.logger.warning(\n                    \"Skipping static quantization for Stable Diffusion \"\n                    \"because for now it's not supported.\"\n                )\n                return\n\n        if self.simplify_model and is_diffusion:\n            optimized_model = str(Path(model).parent / \"model_opt.onnx\")\n            unet = UNet(hf_token=None)\n            opt_graph = unet.optimize(onnx.load(str(model)))\n            try:\n                onnx.save(opt_graph, optimized_model)\n            except Exception:\n                onnx.save(\n                    opt_graph, optimized_model, save_as_external_data=True\n                )\n            self.onnx_model_path = optimized_model\n            self.simplify_model = 
False\n        elif self.onnx_model_path is None:\n            self.onnx_model_path = str(model)\n\n        # -- Build phase --\n        nvidia_logger = trt.Logger(trt.Logger.ERROR)\n        builder = trt.Builder(nvidia_logger)\n        # create network definition\n        network = builder.create_network(\n            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)\n        )\n        # build the engine\n        # TODO: setup config value for the class in a config file\n        config = builder.create_builder_config()\n        try:\n            config.set_memory_pool_limit(\n                trt.MemoryPoolType.WORKSPACE, self.device.get_free_memory()\n            )\n        except AttributeError:\n            # The method set_memory_pool_limit is not available\n            # until TensorRT Release 8.4.1\n            self.logger.warning(\n                \"Cannot call method set_memory_pool_limit for TensorRT. \"\n                \"because your version is lower than 8.4.1. \"\n                \"Please update TensorRT version.\"\n            )\n\n        if quantization_type is not None:\n            config = self._quantize_model(\n                quantization_type,\n                model_params,\n                config,\n                input_tfms,\n                train_input_data\n                if quantization_type is QuantizationType.STATIC\n                else None,\n            )\n\n        self.compiled_model = self._compile_model(\n            onnx_model_path=str(self.onnx_model_path),\n            model_params=model_params,\n            config=config,\n            network=network,\n            builder=builder,\n            nvidia_logger=nvidia_logger,\n        )\n        self.model_orig = self.onnx_model_path\n\n    def _compile_model(\n        self,\n        onnx_model_path: str,\n        model_params: ModelParams,\n        config,\n        network,\n        builder,\n        nvidia_logger,\n    ):\n        parser = trt.OnnxParser(network, 
nvidia_logger)\n        success = parser.parse_from_file(onnx_model_path)\n\n        if not success:\n            for idx in range(parser.num_errors):\n                self.logger.debug(parser.get_error(idx))\n            raise ValueError(\n                f\"Errors occurred while processing the \"\n                f\"ONNX file at {onnx_model_path}\"\n            )\n\n        if model_params.dynamic_info is not None:\n            inputs_shapes = self._extract_dynamic_shape_ranges(model_params)\n            profile = builder.create_optimization_profile()\n            for i, input_name in enumerate(get_input_names(onnx_model_path)):\n                profile.set_shape(\n                    input_name,\n                    inputs_shapes[i][\"min_shape\"],\n                    inputs_shapes[i][\"opt_shape\"],\n                    inputs_shapes[i][\"max_shape\"],\n                )\n            config.add_optimization_profile(profile)\n        return builder.build_serialized_network(network, config)\n\n    @staticmethod\n    def _quantize_model(\n        quantization_type: QuantizationType,\n        model_params: ModelParams,\n        config,\n        input_tfms: MultiStageTransformation,\n        input_data: List[Tuple[np.ndarray, ...]] = None,\n    ):\n        return quantize_tensorrt(\n            quantization_type,\n            model_params,\n            config,\n            input_tfms,\n            input_data,\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/tensorflow.py",
    "content": "from typing import List, Tuple\n\nfrom nebullvm.config import QUANTIZATION_DATA_NUM\nfrom nebullvm.core.models import QuantizationType\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\n\nfrom nebullvm.operations.optimizations.compilers.quantizations.tensorflow import (  # noqa: E501\n    quantize_tensorflow,\n)\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass TensorflowBackendCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [None],\n        \"gpu\": [None],\n    }\n\n    def execute(\n        self,\n        model: tf.Module,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Optimize the input model using tensorflow built-in techniques.\n\n        Args:\n            model (tf.Module): The tensorflow model.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. 
Default: None.\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n\n        self.compiled_model = model\n\n    def _compile_model(self):\n        pass\n\n    @staticmethod\n    def _quantize_model(**kwargs):\n        raise NotImplementedError()\n\n\nclass TFLiteBackendCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [\n            None,\n            QuantizationType.STATIC,\n            QuantizationType.HALF,\n            QuantizationType.DYNAMIC,\n        ],\n        \"gpu\": [],\n    }\n\n    def execute(\n        self,\n        model: tf.Module,\n        input_tfms: MultiStageTransformation,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Optimize the input model using TensorFlow Lite.\n\n        Args:\n            model (tf.Module): The tensorflow model. For avoiding un-wanted\n                modifications to the original model, it will be copied in the\n                method.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. 
Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. Default: None\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n        train_input_data = input_data.get_split(\"train\").get_list(\n            QUANTIZATION_DATA_NUM\n        )\n\n        if quantization_type is not None:\n            self.compiled_model = self._quantize_model(\n                model, quantization_type, train_input_data\n            )\n        else:\n            self.compiled_model = self._compile_model(model)\n\n    def _compile_model(\n        self,\n        model: tf.Module,\n    ):\n        converter = tf.lite.TFLiteConverter.from_keras_model(model)\n        tflite_model = converter.convert()\n        return tflite_model\n\n    @staticmethod\n    def _quantize_model(\n        model: tf.Module,\n        quantization_type: QuantizationType,\n        input_data_tensorflow: List[Tuple[tf.Tensor, ...]],\n    ):\n        return quantize_tensorflow(\n            model, quantization_type, input_data_tensorflow\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_dynamo.py",
    "content": "from typing import Union, Any\n\nfrom nebullvm.core.models import ModelParams, QuantizationType\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\n\nfrom nebullvm.optional_modules.torch import (\n    torch,\n    Module,\n    GraphModule,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass TorchDynamoCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [None],\n        \"gpu\": [None],\n    }\n\n    def execute(\n        self,\n        model: Module,\n        model_params: ModelParams,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Optimize the input model using pytorch built-in techniques.\n\n        Args:\n            model (torch.nn.Module): The pytorch model.\n            model_params (ModelParams): The model parameters.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. 
Default: None.\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        self.compiled_model = self._compile_model(model, model_params)\n\n    @torch.no_grad()\n    def _compile_model(\n        self,\n        model: Union[Module, GraphModule],\n        network_parameters: ModelParams,\n    ) -> Any:\n        dynamic = False\n        if network_parameters.dynamic_info is not None:\n            dynamic = True\n        return torch.compile(model, dynamic=dynamic)\n\n    def _quantize_model(self, **kwargs) -> Any:\n        raise NotImplementedError\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_neuron.py",
    "content": "from typing import List, Tuple\n\nfrom nebullvm.core.models import QuantizationType, ModelParams, DeviceType\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.optional_modules.torch import (\n    torch,\n    symbolic_trace,\n)\nfrom nebullvm.optional_modules.torch_neuron import torch_neuron\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass TorchNeuronCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [],\n        \"gpu\": [],\n        \"neuron\": [None, QuantizationType.HALF],\n    }\n\n    @staticmethod\n    def _check_dynamic_shape(network_parameters: ModelParams) -> bool:\n        \"\"\"Handles case when model inputs have dynamic shapes.\n        For now TorchNeuron only supports dynamic shape for the\n        batch dimension.\n\n        Args:\n            network_parameters (ModelParams): The model parameters.\n\n        Returns:\n            bool: True if the model has dynamic batch size, False otherwise.\n        \"\"\"\n        if network_parameters.dynamic_info is None:\n            return False\n\n        for i, input_shape in enumerate(\n            network_parameters.dynamic_info.inputs\n        ):\n            if len(input_shape) > 1 or (\n                len(input_shape) == 1 and input_shape.get(0) is None\n            ):\n                raise ValueError(\n                    f\"TorchNeuronCompiler only supports dynamic shapes for \"\n                    f\"batch dimension. Provided dynamic info for input {i} \"\n                    f\"is: {input_shape}. 
Please use padding for the other \"\n                    f\"dimensions.\"\n                )\n\n        return True\n\n    def execute(\n        self,\n        model: torch.nn.Module,\n        model_params: ModelParams,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Optimize the input model using pytorch built-in techniques.\n\n        Args:\n            model (torch.nn.Module): The pytorch model.\n            model_params (ModelParams): The model parameters.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. 
Default: None.\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n        dynamic_batch_size = self._check_dynamic_shape(model_params)\n\n        self.compiled_model = self._compile_model(\n            model,\n            input_data,\n            quantization_type,\n            dynamic_batch_size=dynamic_batch_size,\n        )\n\n    @torch.no_grad()\n    def _compile_model(\n        self,\n        model: torch.nn.Module,\n        input_data: DataManager,\n        quantization_type: QuantizationType,\n        dynamic_batch_size: bool,\n    ) -> torch.jit.ScriptModule:\n        input_sample = input_data.get_list(1)[0]\n        if self.device.type is DeviceType.GPU:\n            if quantization_type is QuantizationType.HALF:\n                input_sample = [\n                    t.to(self.device.to_torch_format()).half()\n                    if torch.is_floating_point(t)\n                    else t.to(self.device.to_torch_format())\n                    for t in input_sample\n                ]\n            else:\n                input_sample = [\n                    t.to(self.device.to_torch_format()) for t in input_sample\n                ]\n            model.to(self.device.to_torch_format())\n        model.eval()\n\n        try:\n            model_scripted = symbolic_trace(model)\n            model_scripted = torch_neuron.trace(\n                model_scripted,\n                input_sample,\n                dynamic_batch_size=dynamic_batch_size,\n                
compiler_args=[\"--fast-math\", \"none\"]\n                if quantization_type is None\n                else None,\n            )\n        except Exception:\n            try:\n                model_scripted = torch_neuron.trace(\n                    model,\n                    input_sample,\n                    dynamic_batch_size=dynamic_batch_size,\n                    compiler_args=[\"--fast-math\", \"none\"]\n                    if quantization_type is None\n                    else None,\n                )\n            except Exception:\n                raise RuntimeError(\"Unable to trace model with torch_neuron.\")\n\n        return model_scripted\n\n    @torch.no_grad()\n    def _quantize_model(\n        self,\n        model: torch.nn.Module,\n        quantization_type: QuantizationType,\n        input_tfms: MultiStageTransformation,\n        input_data_torch: List[Tuple[torch.Tensor, ...]],\n    ):\n        raise NotImplementedError()\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_xla.py",
    "content": "from nebullvm.core.models import QuantizationType\nfrom nebullvm.operations.optimizations.compilers.torchscript import (\n    TorchScriptCompiler,\n)\nfrom nebullvm.optional_modules.torch import (\n    torch,\n)\nfrom nebullvm.tools.data import DataManager\n\n\nclass TorchXLACompiler(TorchScriptCompiler):\n    supported_ops = {\n        \"cpu\": [],\n        \"gpu\": [],\n        \"tpu\": [None, QuantizationType.HALF],\n    }\n\n    @torch.no_grad()\n    def _compile_model(\n        self,\n        model: torch.nn.Module,\n        input_data: DataManager,\n        quantization_type: QuantizationType,\n    ) -> torch.nn.Module:\n        compiled_model = model.to(self.device.to_torch_format())\n        return compiled_model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/torchscript.py",
    "content": "from typing import Union, List, Tuple\n\nfrom nebullvm.config import QUANTIZATION_DATA_NUM\nfrom nebullvm.core.models import QuantizationType, DeviceType\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\n\nfrom nebullvm.operations.optimizations.compilers.quantizations.pytorch import (\n    quantize_pytorch,\n)\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.optional_modules.torch import (\n    torch,\n    Module,\n    ScriptModule,\n    GraphModule,\n    symbolic_trace,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass TorchScriptCompiler(Compiler):\n    supported_ops = {\n        \"cpu\": [None, QuantizationType.STATIC, QuantizationType.DYNAMIC],\n        \"gpu\": [\n            None,\n            QuantizationType.HALF,\n        ],\n    }\n\n    def execute(\n        self,\n        model: Module,\n        input_tfms: MultiStageTransformation = None,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Optimize the input model using pytorch built-in techniques.\n\n        Args:\n            model (torch.nn.Module): The pytorch model.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. 
Default: None.\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n        train_input_data = input_data.get_split(\"train\").get_list(\n            QUANTIZATION_DATA_NUM\n        )\n\n        if quantization_type is not None:\n            model = self._quantize_model(\n                model, quantization_type, input_tfms, train_input_data\n            )\n\n        self.compiled_model = self._compile_model(\n            model, input_data, quantization_type\n        )\n\n    @torch.no_grad()\n    def _compile_model(\n        self,\n        model: Union[Module, GraphModule],\n        input_data: DataManager,\n        quantization_type: QuantizationType,\n    ) -> ScriptModule:\n        input_sample = input_data.get_list(1)[0]\n        if self.device.type is DeviceType.GPU:\n            if quantization_type is QuantizationType.HALF:\n                input_sample = [\n                    t.to(self.device.to_torch_format()).half()\n                    if torch.is_floating_point(t)\n                    else t.to(self.device.to_torch_format())\n                    for t in input_sample\n                ]\n            else:\n                input_sample = [\n                    t.to(self.device.to_torch_format()) for t in input_sample\n                ]\n            model.to(self.device.to_torch_format())\n\n        if not isinstance(model, torch.fx.GraphModule):\n            model.eval()\n            try:\n                model_scripted = symbolic_trace(model)\n                
model_scripted = torch.jit.script(model_scripted)\n            except Exception:\n                if quantization_type is None:\n                    self.logger.warning(\"Unable to trace model with torch.fx\")\n                try:\n                    model_scripted = torch.jit.script(model)\n                except Exception:\n                    model_scripted = torch.jit.trace(model, input_sample)\n        else:\n            model_scripted = torch.jit.script(model)\n\n        return model_scripted\n\n    @torch.no_grad()\n    def _quantize_model(\n        self,\n        model: Module,\n        quantization_type: QuantizationType,\n        input_tfms: MultiStageTransformation,\n        input_data_torch: List[Tuple[torch.Tensor, ...]],\n    ):\n        return quantize_pytorch(\n            model, quantization_type, input_tfms, input_data_torch, self.device\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/tvm.py",
    "content": "import abc\nimport os\nimport uuid\nfrom abc import ABC\nfrom typing import Any, Tuple, Dict, Union\n\nfrom nebullvm.config import (\n    AUTO_TVM_PARAMS,\n    AUTO_TVM_TUNING_OPTION,\n)\nfrom nebullvm.core.models import (\n    QuantizationType,\n    ModelParams,\n    DeviceType,\n    Device,\n)\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\nfrom nebullvm.operations.optimizations.compilers.quantizations.tvm import (\n    TVMCalibrator,\n    quantize_apache_tvm,\n)\nfrom nebullvm.operations.optimizations.compilers.quantizations.utils import (\n    check_quantization,\n)\nfrom nebullvm.optional_modules.onnx import onnx\nfrom nebullvm.optional_modules.torch import Module, torch\nfrom nebullvm.optional_modules.tvm import (\n    tvm,\n    IRModule,\n    NDArray,\n    XGBTuner,\n    autotvm,\n    relay,\n    ExecutorFactoryModule,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.onnx import get_input_names\nfrom nebullvm.tools.pytorch import create_model_inputs_torch\nfrom nebullvm.tools.transformations import MultiStageTransformation\n\n\nclass ApacheTVMCompiler(Compiler, ABC):\n    supported_ops = {\n        \"cpu\": [\n            None,\n            # QuantizationType.STATIC,\n            QuantizationType.HALF,\n            QuantizationType.DYNAMIC,\n        ],\n        \"gpu\": [\n            None,\n            # QuantizationType.STATIC,\n            QuantizationType.HALF,\n            QuantizationType.DYNAMIC,\n        ],\n    }\n\n    def __init__(self):\n        super().__init__()\n        self.model_orig = None\n\n    def execute(\n        self,\n        model: Union[Module, str],\n        input_tfms: MultiStageTransformation,\n        model_params: ModelParams,\n        metric_drop_ths: float = None,\n        quantization_type: QuantizationType = None,\n        input_data: DataManager = None,\n        **kwargs,\n    ):\n        \"\"\"Compile the input model using Apache TVM compiler.\n\n        Args:\n 
           model (Union[Module, str]): The input model. Can be a torch model\n                or a path to an onnx model.\n            input_tfms (MultiStageTransformation, optional): Transformations\n                to be performed to the model's input tensors in order to\n                get the prediction. Default: None.\n            model_params (ModelParams): Model parameters.\n            metric_drop_ths (float, optional): Threshold for the accepted drop\n                in terms of precision. Any optimized model with a higher drop\n                will be ignored. Default: None.\n            quantization_type (QuantizationType, optional): The desired\n                quantization algorithm to be used. Default: None.\n            input_data (DataManager): User defined data. Default: None\n        \"\"\"\n\n        if quantization_type not in self.supported_ops[self.device.type.value]:\n            self.compiled_model = None\n            return\n\n        if quantization_type is QuantizationType.STATIC and input_data is None:\n            raise ValueError(\"Input data is required for static quantization.\")\n\n        self.logger.info(\n            f\"Optimizing with {self.__class__.__name__} and \"\n            f\"q_type: {quantization_type}.\"\n        )\n\n        check_quantization(quantization_type, metric_drop_ths)\n\n        mod, params = self._build_tvm_model(model, model_params)\n\n        if quantization_type is not None:\n            mod = self._quantize_model(\n                mod, quantization_type, input_tfms, input_data, params\n            )\n\n        self.compiled_model = self._compile_model(mod, params)\n\n    @abc.abstractmethod\n    def _build_tvm_model(self, model: Any, model_params: ModelParams):\n        raise NotImplementedError()\n\n    @staticmethod\n    def _build_tvm_model_from_torch(\n        torch_model: Module, model_params: ModelParams, device: Device\n    ) -> Tuple[IRModule, Dict[str, NDArray]]:\n        shape_dict = {\n       
     f\"input_{i}\": input_size\n            for i, input_size in enumerate(model_params.input_sizes)\n        }\n        inputs = tuple(create_model_inputs_torch(model_params.input_infos))\n        if device.type is not DeviceType.GPU:\n            inputs = tuple(input_.cpu() for input_ in inputs)\n            torch_model.cpu()\n        else:\n            inputs = tuple(\n                input_.to(device.to_torch_format()) for input_ in inputs\n            )\n            torch_model.to(device.to_torch_format())\n        with torch.no_grad():\n            _ = torch_model(*inputs)\n            model_trace = torch.jit.trace(torch_model, inputs)\n            model_trace.eval()\n        mod, params = relay.frontend.from_pytorch(\n            model_trace, list(shape_dict.items())\n        )\n        return mod, params\n\n    @staticmethod\n    def _build_tvm_model_from_onnx(\n        onnx_model_path: str, model_params: ModelParams\n    ) -> Tuple[IRModule, Dict[str, NDArray]]:\n        shape_dict = {\n            input_key: input_size\n            for input_key, input_size in zip(\n                get_input_names(onnx_model_path), model_params.input_sizes\n            )\n        }\n        onnx_model = onnx.load(onnx_model_path)\n        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)\n        return mod, params\n\n    @staticmethod\n    def _quantize(\n        mod: IRModule,\n        params: Dict[str, NDArray],\n        input_data: TVMCalibrator = None,\n    ) -> IRModule:\n        if input_data is not None:\n            with relay.quantize.qconfig(\n                calibrate_mode=\"kl_divergence\", weight_scale=\"max\"\n            ):\n                mod = relay.quantize.quantize(mod, params, dataset=input_data)\n        else:\n            with relay.quantize.qconfig(\n                calibrate_mode=\"global_scale\", global_scale=8.0\n            ):\n                mod = relay.quantize.quantize(mod, params)\n        return mod\n\n    @staticmethod\n   
 def _get_target(device) -> str:\n        if device.type is DeviceType.GPU:\n            return str(tvm.target.cuda())\n        else:\n            return \"llvm\"  # run on CPU\n\n    @staticmethod\n    def _tune_tvm_model(\n        target: str, mod: IRModule, params: Dict[str, NDArray]\n    ) -> str:\n        \"\"\"Tune the model using AutoTVM.\"\"\"\n        # TODO: add support to Ansor\n        tuning_records = f\"{uuid.uuid4()}_model_records.json\"\n        # create a TVM runner\n        runner = autotvm.LocalRunner(\n            number=AUTO_TVM_PARAMS[\"number\"],\n            repeat=AUTO_TVM_PARAMS[\"repeat\"],\n            timeout=AUTO_TVM_PARAMS[\"timeout\"],\n            min_repeat_ms=AUTO_TVM_PARAMS[\"min_repeat_ms\"],\n            # TODO modify min_repeat_ms for GPU usage\n            enable_cpu_cache_flush=True,\n        )\n        # begin by extracting the tasks from the onnx model\n        tasks = autotvm.task.extract_from_program(\n            mod[\"main\"], target=target, params=params\n        )\n\n        # Tune the extracted tasks sequentially.\n        for i, task in enumerate(tasks):\n            tuner_obj = XGBTuner(task, loss_type=\"rank\")\n            tuner_obj.tune(\n                n_trial=min(\n                    AUTO_TVM_TUNING_OPTION[\"trials\"], len(task.config_space)\n                ),\n                early_stopping=AUTO_TVM_TUNING_OPTION[\"early_stopping\"],\n                measure_option=autotvm.measure_option(\n                    builder=autotvm.LocalBuilder(build_func=\"default\"),\n                    runner=runner,\n                ),\n                callbacks=[\n                    autotvm.callback.log_to_file(tuning_records),\n                ],\n            )\n        return tuning_records\n\n    def _compile_model(self, model: Any, params: Any) -> ExecutorFactoryModule:\n        target = self._get_target(self.device)\n        tuning_records = self._tune_tvm_model(target, model, params)\n        with 
autotvm.apply_history_best(tuning_records):\n            with tvm.transform.PassContext(opt_level=3, config={}):\n                lib = relay.build(model, target=target, params=params)\n\n        # Remove temporary file created by tvm\n        os.remove(tuning_records)\n\n        return lib\n\n    @staticmethod\n    def _quantize_model(\n        model: Any,\n        quantization_type: QuantizationType,\n        input_tfms: MultiStageTransformation,\n        input_data: DataManager,\n        params,\n    ):\n        return quantize_apache_tvm(\n            model, quantization_type, input_tfms, input_data, params\n        )\n\n\nclass PyTorchApacheTVMCompiler(ApacheTVMCompiler):\n    def _build_tvm_model(self, model: Any, model_params: ModelParams):\n        return self._build_tvm_model_from_torch(\n            model, model_params, self.device\n        )\n\n\nclass ONNXApacheTVMCompiler(ApacheTVMCompiler):\n    def _build_tvm_model(self, model: Any, model_params: ModelParams):\n        self.model_orig = model\n        return self._build_tvm_model_from_onnx(model, model_params)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compilers/utils.py",
    "content": "from pathlib import Path\n\nimport nebullvm\nfrom nebullvm.core.models import Device, ModelCompiler, DeviceType\n\n\ndef onnxruntime_is_available() -> bool:\n    try:\n        import onnxruntime  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef tvm_is_available() -> bool:\n    try:\n        import tvm  # noqa F401\n        from tvm.runtime import Module  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef bladedisc_is_available() -> bool:\n    try:\n        import torch_blade  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef tensorrt_is_available() -> bool:\n    try:\n        import polygraphy  # noqa F401\n        import tensorrt  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef torch_tensorrt_is_available() -> bool:\n    try:\n        import torch_tensorrt  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef openvino_is_available() -> bool:\n    try:\n        from openvino.runtime import Core  # noqa F401\n    except ImportError:\n        return False\n    else:\n        return True\n\n\ndef deepsparse_is_available() -> bool:\n    try:\n        import deepsparse  # noqa F401\n    except ImportError:\n        return False\n    else:\n        return True\n\n\ndef intel_neural_compressor_is_available() -> bool:\n    try:\n        import neural_compressor  # noqa F401\n    except ImportError:\n        return False\n    else:\n        return True\n\n\ndef torch_xla_is_available():\n    try:\n        import torch_xla  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef torch_neuron_is_available():\n    try:\n        import torch_neuron  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef get_faster_transformer_repo_path() -> Path:\n    return 
Path(nebullvm.__file__).parent.joinpath(\"FasterTransformer\")\n\n\ndef faster_transformer_is_available() -> bool:\n    return (\n        get_faster_transformer_repo_path()\n        .parent.joinpath(\"FasterTransformer_build_success\")\n        .exists()\n    )\n\n\ndef select_compilers_from_hardware_onnx(device: Device):\n    from nebullvm.optional_modules.utils import onnx_is_available\n\n    compilers = []\n    if onnx_is_available():\n        if onnxruntime_is_available():\n            compilers.append(ModelCompiler.ONNX_RUNTIME)\n        if tvm_is_available():\n            compilers.append(ModelCompiler.APACHE_TVM)\n        if device.type is DeviceType.GPU and tensorrt_is_available():\n            compilers.append(ModelCompiler.TENSOR_RT)\n        if device.type is DeviceType.CPU and openvino_is_available():\n            compilers.append(ModelCompiler.OPENVINO)\n    return compilers\n\n\ndef select_compilers_from_hardware_torch(device: Device):\n    from nebullvm.optional_modules.utils import torch_is_available\n\n    compilers = []\n    if torch_is_available():\n        compilers.append(ModelCompiler.TORCHSCRIPT)\n        if tvm_is_available():\n            compilers.append(ModelCompiler.APACHE_TVM)\n        if bladedisc_is_available():\n            compilers.append(ModelCompiler.BLADEDISC)\n        if torch_neuron_is_available():\n            compilers.append(ModelCompiler.TORCH_NEURON)\n\n        if device.type is DeviceType.CPU:\n            if deepsparse_is_available():\n                compilers.append(ModelCompiler.DEEPSPARSE)\n            if intel_neural_compressor_is_available():\n                compilers.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)\n        elif device.type is DeviceType.GPU:\n            if torch_tensorrt_is_available:\n                compilers.append(ModelCompiler.TENSOR_RT)\n    return compilers\n\n\ndef select_compilers_from_hardware_tensorflow():\n    from nebullvm.optional_modules.utils import tensorflow_is_available\n\n    
compilers = []\n    if tensorflow_is_available():\n        compilers.append(ModelCompiler.XLA)\n        compilers.append(ModelCompiler.TFLITE)\n    return compilers\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compressors/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compressors/base.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import Any, Optional, Dict, Callable, Tuple\n\nimport yaml\n\nfrom nebullvm.operations.base import Operation\nfrom nebullvm.tools.data import DataManager\n\n\nclass Compressor(Operation, ABC):\n    def __init__(self, config_file: str = None):\n        super().__init__()\n        self._config = self._read_config(config_file)\n        self.compressed_model = None\n        self.new_metric_ths = None\n\n    @abstractmethod\n    def execute(\n        self,\n        model: Any,\n        train_input_data: DataManager,\n        eval_input_data: DataManager,\n        metric_drop_ths: float,\n        metric: Callable,\n    ) -> Tuple[Any, Optional[float]]:\n        raise NotImplementedError()\n\n    def _read_config(self, config_file: Optional[str]) -> Dict:\n        config = self._get_default_config()\n        if config_file is not None:\n            with open(config_file, \"r\") as f:\n                data = yaml.load(f, Loader=yaml.CLoader)\n                config.update(data.get(self.config_key, {}))\n        return config\n\n    @staticmethod\n    @abstractmethod\n    def _get_default_config() -> Dict:\n        raise NotImplementedError\n\n    @property\n    @abstractmethod\n    def config_key(self) -> str:\n        raise NotImplementedError()\n\n    def get_result(self) -> Tuple[Any, Optional[float]]:\n        return self.compressed_model, self.new_metric_ths\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compressors/intel.py",
    "content": "import copy\nimport re\nfrom abc import ABC, abstractmethod\nfrom pathlib import Path\nfrom tempfile import mkdtemp\nfrom typing import Dict, Any, Callable\n\nimport numpy as np\nimport yaml\n\nfrom nebullvm.operations.optimizations.compressors.base import Compressor\nfrom nebullvm.optional_modules.neural_compressor import Pruning\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import DataLoader, Dataset, Module\nfrom nebullvm.tools.data import DataManager\n\n\ndef _get_model_framework(model: Any) -> str:\n    if isinstance(model, Module):\n        return \"torch\"\n    elif isinstance(model, tf.Module) and model is not None:\n        return \"tensorflow\"\n    else:\n        return \"numpy\"\n\n\nclass IntelPruningCompressor(Compressor, ABC):\n    def __init__(self, config_file: str = None):\n        super().__init__(config_file)\n        self._temp_dir = mkdtemp()\n\n    @property\n    def config_key(self) -> str:\n        return \"intel_pruning\"\n\n    @staticmethod\n    def _get_default_config() -> Dict:\n        # see https://github.com/intel/neural-compressor/blob/master/neural_compressor/conf/config.py  # noqa\n        # for further details\n        config = {\n            \"train\": {\n                \"optimizer\": {\n                    \"SGD\": {\"learning_rate\": 0.001},\n                },\n                \"criterion\": {\n                    \"CrossEntropyLoss\": {\n                        \"reduction\": \"mean\",\n                        \"from_logits\": False,\n                    },\n                },\n                \"epoch\": 10,\n                \"start_epoch\": 0,\n                \"end_epoch\": 10,\n                \"iteration\": 30,\n                \"execution_mode\": \"eager\",  # either eager or graph\n                # \"hostfile\": None,  # str for multinode training support\n            },\n            \"approach\": {\n                \"weight_compression\": 
{\n                    \"initial_sparsity\": 0.0,\n                    \"target_sparsity\": 0.60,\n                    \"start_epoch\": 0,\n                    \"end_epoch\": 8,\n                    \"pruners\": [\n                        {\n                            \"start_epoch\": 0,\n                            \"end_epoch\": 8,\n                            \"prune_type\": \"basic_magnitude\",\n                        },\n                    ],\n                }\n            },\n        }\n        return config\n\n    def _prepare_pruning_config(self, model: Any):\n        pruning_config = copy.deepcopy(self._config)\n        framework = _get_model_framework(model)\n        config = {\n            \"model\": {\n                \"name\": model.__class__.__name__,\n                \"framework\": framework if framework != \"torch\" else \"pytorch\",\n            },\n            \"evaluation\": {\"accuracy\": {\"metric\": {\"topk\": 1}}},\n            \"device\": \"cpu\",\n            \"tuning\": {\n                \"random_seed\": 1978,\n                \"tensorboard\": False,\n                \"workspace\": {\"path\": self._temp_dir},\n            },\n            \"pruning\": pruning_config,\n        }\n        path_file = Path(self._temp_dir) / \"temp.yaml\"\n        with open(path_file, \"w\") as f:\n            yaml.dump(config, f)\n        with open(path_file, \"r+\") as f:\n            file_str = f.read()\n            file_str = re.sub(\n                \"pruners:\\n      - end_epoch:\",\n                \"pruners:\\n      - !Pruner\\n        end_epoch:\",\n                file_str,\n            )\n            f.seek(0)\n            f.write(file_str)\n        return path_file\n\n    def execute(\n        self,\n        model: Any,\n        train_input_data: DataManager,\n        eval_input_data: DataManager,\n        metric_drop_ths: float,\n        metric: Callable,\n    ):\n        config_file_pr = self._prepare_pruning_config(model)\n        prune = 
Pruning(str(config_file_pr))\n        prune.model = model\n        prune.train_dataloader = self._get_dataloader(train_input_data)\n        prune.eval_dataloader = self._get_dataloader(eval_input_data)\n        self.compressed_model = prune.fit()\n\n        if self.compressed_model is not None:\n            error = self._compute_error(\n                model, self.compressed_model, eval_input_data, metric\n            )\n            if error > metric_drop_ths:\n                self.compressed_model = None\n            else:\n                self.new_metric_ths = metric_drop_ths - error\n\n    @abstractmethod\n    def _compute_error(\n        self,\n        model: Any,\n        compressed_model: Any,\n        eval_input_data: DataManager,\n        metric: Callable,\n    ):\n        raise NotImplementedError\n\n    @staticmethod\n    @abstractmethod\n    def _get_dataloader(input_data: DataManager):\n        raise NotImplementedError\n\n\nclass INCDataset(Dataset):\n    def __init__(self, input_data: DataManager):\n        self.data = input_data\n        self.batch_size = input_data[0][0][0].shape[0]\n\n    def __len__(self):\n        return sum([batch_inputs[0].shape[0] for batch_inputs, _ in self.data])\n\n    def __getitem__(self, idx):\n        batch_idx = int(idx / self.batch_size)\n        item_idx = idx % self.batch_size\n        data = tuple([data[item_idx] for data in self.data[batch_idx][0]])\n        return data, self.data[batch_idx][1][item_idx]\n\n\nclass TorchIntelPruningCompressor(IntelPruningCompressor):\n    @staticmethod\n    def _get_dataloader(input_data: DataManager):\n        bs = input_data[0][0][0].shape[0]\n        ds = INCDataset(input_data)\n        dl = DataLoader(ds, bs)\n        return dl\n\n    def _compute_error(\n        self,\n        model: Module,\n        compressed_model: Module,\n        eval_input_data: DataManager,\n        metric: Callable,\n    ):\n        if len(eval_input_data) == 0:\n            return np.inf\n        
metric_val = 0\n        for inputs, y in eval_input_data:\n            pred_model = model(*inputs)\n            pred_compressed_model = compressed_model(*inputs)\n            metric_val += metric(pred_model, pred_compressed_model, y)\n        return metric_val / len(eval_input_data)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compressors/scripts/__init__.py",
    "content": "import json\nimport logging\nimport os.path\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Tuple, List, Any, Dict\n\nimport torch\nfrom sparseml.onnx.optim import ModelAnalyzer, pruning_loss_sens_magnitude\nfrom sparseml.pytorch.optim import (\n    ScheduledModifierManager,\n)\nfrom sparseml.pytorch.sparsification import (\n    EpochRangeModifier,\n    GMPruningModifier,\n)\nfrom sparseml.pytorch.utils import ModuleExporter\nfrom sparsify.blueprints.utils import (\n    default_epochs_distribution,\n    PruningModelEvaluator,\n    default_pruning_settings,\n)\nfrom sparsify.schemas import ProjectModelAnalysisSchema\nfrom torch.nn import CrossEntropyLoss, MSELoss\nfrom torch.optim import SGD\nfrom tqdm.auto import tqdm\n\nCRITERION_FNS = {\n    \"CrossEntropy\": CrossEntropyLoss(),\n    \"MSE\": MSELoss(),\n}\n\nlogging.basicConfig(\n    format=\" %(asctime)s [%(levelname)s] %(message)s\",\n    datefmt=\"%d/%m/%Y %I:%M:%S %p\",\n)\nlogger = logging.getLogger(\"nebullvm_logger\")\nlogger.setLevel(logging.INFO)\n\n\ndef _export_model_onnx(\n    model: torch.nn.Module,\n    save_path: Path,\n    model_name: str,\n    input_batch: Tuple,\n):\n    if torch.cuda.is_available():\n        input_batch = tuple(t.cuda() for t in input_batch)\n        model.cuda()\n\n    exporter = ModuleExporter(model, output_dir=save_path)\n    with torch.no_grad():\n        example_outputs = model(*input_batch)\n    exporter.export_onnx(\n        input_batch, name=model_name, example_outputs=example_outputs\n    )\n    onnx_path = save_path / model_name\n\n    return onnx_path\n\n\nclass RecipeBuilder:\n    def __init__(self, model_path):\n        self.model_path = model_path\n\n    def _make_analysis(self):\n        analyzer = ModelAnalyzer(self.model_path)\n        self.analysis = ProjectModelAnalysisSchema().load(analyzer.dict())\n\n    def _compute_loss_sensitivity(self):\n        sensitivities = []\n        parameters = []\n        
for i, node in enumerate(self.analysis[\"nodes\"]):\n            if node[\"prunable\"]:\n                sensitivities.append(node[\"prunable_equation_sensitivity\"])\n                parameters.append(node[\"prunable_params\"])\n\n        loss_analysis = pruning_loss_sens_magnitude(self.model_path)\n\n        results_model = loss_analysis.results_model\n        results = loss_analysis.results\n\n        model = {\n            \"baseline_measurement_key\": (\n                str(results_model.baseline_measurement_key)\n            ),\n            \"measurements\": {\n                str(key): val for key, val in results_model.averages.items()\n            },\n        }\n        ops = []\n\n        for res in results:\n            ops.append(\n                {\n                    \"id\": res.id_,\n                    \"name\": res.name,\n                    \"index\": res.index,\n                    \"baseline_measurement_key\": (\n                        str(res.baseline_measurement_key)\n                    ),\n                    \"measurements\": {\n                        str(key): val for key, val in res.averages.items()\n                    },\n                }\n            )\n\n        pruning = {\"model\": model, \"ops\": ops}\n        loss = {}\n        loss[\"baseline\"] = {}\n        loss[\"pruning\"] = pruning\n\n        model = PruningModelEvaluator(\n            self.analysis,\n            None,\n            loss,\n        )\n        model.eval_baseline(default_pruning_settings().sparsity)\n        model.eval_pruning(default_pruning_settings())\n\n        self.final_analysis = model.to_dict_values()\n\n    def build_recipe(self, epochs_pruning_window=None, training_epochs=10):\n        self._make_analysis()\n        self._compute_loss_sensitivity()\n\n        if epochs_pruning_window is None:\n            epochs = default_epochs_distribution(training_epochs)\n        else:\n            # TODO: set custom parameters\n            epochs = 
default_epochs_distribution(training_epochs)\n            epochs_dict = epochs._asdict()\n            epochs_dict.update(epochs_pruning_window)\n            epochs = epochs.__class__(**epochs_dict)\n\n        mods = [\n            EpochRangeModifier(\n                start_epoch=epochs.start_epoch,\n                end_epoch=epochs.end_epoch,\n            )\n        ]\n\n        node_weight_name_lookup = {\n            node[\"id\"]: node[\"weight_name\"]\n            for node in self.analysis[\"nodes\"]\n            if node[\"prunable\"]\n        }\n\n        sparsity_to_params = {}\n\n        nodes = self.final_analysis[0]\n\n        for node in nodes:\n            sparsity = node[\"sparsity\"]\n            node_id = node[\"node_id\"]\n            weight_name = node_weight_name_lookup[node_id]\n\n            if sparsity is None:\n                continue\n\n            if sparsity not in sparsity_to_params:\n                sparsity_to_params[sparsity] = []\n\n            sparsity_to_params[sparsity].append(weight_name)\n\n        for sparsity, params in sparsity_to_params.items():\n            gm_pruning = GMPruningModifier(\n                init_sparsity=0.05,\n                final_sparsity=sparsity,\n                start_epoch=epochs.pruning_start_epoch,\n                end_epoch=epochs.pruning_end_epoch,\n                update_frequency=epochs.pruning_update_frequency,\n                params=params,\n            )\n\n            mods.append(gm_pruning)\n\n        return ScheduledModifierManager(mods)\n\n\nclass PruningTrainer:\n    def __init__(self, model, bs):\n        self.data_loader = None\n        self.optimizer = None\n        self.model = model\n        self.batch_size = bs\n\n    def _setup_training(self, loss_fn=None, lr=1e-3, momentum=0.9):\n        self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n        self.model.to(self.device)\n        if loss_fn is None:\n            loss_fn = CrossEntropyLoss()\n        else:\n           
 loss_fn = CRITERION_FNS.get(loss_fn, CrossEntropyLoss())\n        self.criterion = loss_fn\n        self.optimizer = SGD(self.model.parameters(), lr=lr, momentum=momentum)\n\n    def _run_model_one_epoch(self, train=False):\n\n        if train:\n            self.model.train()\n            data_loader = self.train_data_loader\n        else:\n            self.model.eval()\n            data_loader = self.val_data_loader\n\n        running_loss = 0.0\n\n        for step, (inputs, labels) in tqdm(\n            enumerate(data_loader), total=len(data_loader)\n        ):\n            inputs = tuple(t.to(self.device) for t in inputs)\n            if not isinstance(labels, torch.Tensor):\n                labels = torch.tensor(labels)\n                if len(labels.shape) == 0:\n                    labels = labels.unsqueeze(0)\n            labels = labels.to(self.device)\n\n            if train:\n                self.optimizer.zero_grad()\n\n            outputs = self.model(\n                *inputs\n            )  # model returns logits and softmax as a tuple\n            loss = self.criterion(outputs, labels)\n\n            if train:\n                loss.backward()\n                self.optimizer.step()\n\n            running_loss += loss.item()\n\n        loss = running_loss / (len(data_loader) + 1e-5)\n        return loss\n\n    def train(\n        self, manager, train_data_loader, val_data_loader, **train_kwargs\n    ):\n        self.train_data_loader = train_data_loader\n        self.val_data_loader = val_data_loader\n        self._setup_training(**train_kwargs)\n        self.optimizer = manager.modify(\n            self.model,\n            self.optimizer,\n            steps_per_epoch=len(self.train_data_loader),\n        )\n        self.model.train()\n        # Run model pruning\n        epoch = manager.min_epochs\n        while epoch < manager.max_epochs:\n            # run training loop\n            epoch_name = \"{}/{}\".format(epoch + 1, manager.max_epochs)\n     
       logger.info(\"Running Training Epoch {}\".format(epoch_name))\n            train_loss = self._run_model_one_epoch(train=True)\n            logger.info(\n                (\"Training Epoch: {}\\nTraining Loss: {}\\n\").format(\n                    epoch_name, train_loss\n                )\n            )\n\n            # run validation loop\n            logger.info(\"Running Validation Epoch {}\".format(epoch_name))\n            val_loss = self._run_model_one_epoch()\n            logger.info(\n                \"Validation Epoch: {}\\nVal Loss: {}\\n\".format(\n                    epoch_name, val_loss\n                )\n            )\n\n            epoch += 1\n\n        manager.finalize(self.model)\n\n        return self.model\n\n\ndef _load_config(config_file: str):\n    with open(config_file, \"r\") as f:\n        config = json.load(f)\n    return config\n\n\ndef _load_data(data_dir: str):\n    data_dir = Path(data_dir)\n    return [torch.load(input_path) for input_path in data_dir.glob(\"*.pt\")]\n\n\ndef _load_model(model_file: str):\n    if os.path.isdir(model_file):\n        path = Path(model_file)\n        module_file = path / \"module.py\"\n        with open(module_file, \"r\") as f:\n            module_str = f.read()\n        exec(module_str, globals())\n        model = eval(\"NebullvmFxModule\")()\n        model.load_state_dict(torch.load(path / \"state_dict.pt\"))\n    else:\n        model = torch.load(model_file)\n    return model\n\n\ndef _train_model(\n    model: torch.nn.Module,\n    train_data: List[Tuple[Tuple, Any]],\n    eval_data: List[Tuple[Tuple, Any]],\n    epochs_pruning_window: Dict = None,\n    training_epochs: int = 10,\n    lr: float = 1e-3,\n    momentum: float = 0.9,\n    loss_fn: str = \"CrossEntropy\",\n):\n    batch_size = train_data[0][0][0].shape[0]\n    with TemporaryDirectory() as tmp_dir:\n        onnx_path = _export_model_onnx(\n            model, Path(tmp_dir), \"model.onnx\", train_data[0][0]\n        )\n        
onnx_path = onnx_path.as_posix()\n\n        recipe = RecipeBuilder(onnx_path)\n        # TODO: implement custom parameters support\n        manager = recipe.build_recipe(\n            epochs_pruning_window=epochs_pruning_window,\n            training_epochs=training_epochs,\n        )\n        trainer = PruningTrainer(model, batch_size)\n        pruned_model = trainer.train(\n            manager, train_data, eval_data, lr=lr, momentum=momentum\n        )\n        return pruned_model\n\n\ndef _save_model(model: torch.nn.Module, path: str):\n    if path.endswith(\".pt\"):\n        torch.save(model, path)\n    else:\n        torch.save(model.state_dict(), Path(path) / \"pruned_state_dict.pt\")\n\n\ndef main(\n    model_file: str,\n    train_data_dir: str,\n    eval_data_dir: str,\n    config_file: str,\n    out_file: str,\n):\n    config = _load_config(config_file)\n    model = _load_model(model_file)\n    train_data = _load_data(train_data_dir)\n    eval_data = _load_data(eval_data_dir)\n    pruned_model = _train_model(model, train_data, eval_data, **config)\n    _save_model(pruned_model, out_file)\n\n\nif __name__ == \"__main__\":\n    from argparse import ArgumentParser\n\n    parser = ArgumentParser()\n    parser.add_argument(\"--model\", help=\"The model to be pruned.\")\n    parser.add_argument(\n        \"--train_dir\",\n        help=\"The directory contained the pickled training data.\",\n    )\n    parser.add_argument(\n        \"--eval_dir\", help=\"The directory contained the pickled test data.\"\n    )\n    parser.add_argument(\"--config\", help=\"The config file.\")\n    parser.add_argument(\n        \"--pruned_model\", help=\"Path where storing the pruned model.\"\n    )\n    args = parser.parse_args()\n    main(\n        model_file=args.model,\n        train_data_dir=args.train_dir,\n        eval_data_dir=args.eval_dir,\n        config_file=args.config,\n        out_file=args.pruned_model,\n    )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compressors/scripts/neural_magic_training.py",
    "content": "import json\nimport logging\nimport os.path\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Tuple, List, Any, Dict\n\nimport torch\nfrom sparseml.onnx.optim import ModelAnalyzer, pruning_loss_sens_magnitude\nfrom sparseml.pytorch.optim import (\n    ScheduledModifierManager,\n)\nfrom sparseml.pytorch.sparsification import (\n    EpochRangeModifier,\n    GMPruningModifier,\n)\nfrom sparseml.pytorch.utils import ModuleExporter\nfrom sparsify.blueprints.utils import (\n    default_epochs_distribution,\n    PruningModelEvaluator,\n    default_pruning_settings,\n)\nfrom sparsify.schemas import ProjectModelAnalysisSchema\nfrom torch.nn import CrossEntropyLoss, MSELoss\nfrom torch.optim import SGD\nfrom tqdm.auto import tqdm\n\nCRITERION_FNS = {\n    \"CrossEntropy\": CrossEntropyLoss(),\n    \"MSE\": MSELoss(),\n}\n\nlogging.basicConfig(\n    format=\" %(asctime)s [%(levelname)s] %(message)s\",\n    datefmt=\"%d/%m/%Y %I:%M:%S %p\",\n)\nlogger = logging.getLogger(\"nebullvm_logger\")\nlogger.setLevel(logging.INFO)\n\n\ndef _export_model_onnx(\n    model: torch.nn.Module,\n    save_path: Path,\n    model_name: str,\n    input_batch: Tuple,\n):\n    if torch.cuda.is_available():\n        input_batch = tuple(t.cuda() for t in input_batch)\n        model.cuda()\n\n    exporter = ModuleExporter(model, output_dir=save_path)\n    with torch.no_grad():\n        example_outputs = model(*input_batch)\n    exporter.export_onnx(\n        input_batch, name=model_name, example_outputs=example_outputs\n    )\n    onnx_path = save_path / model_name\n\n    return onnx_path\n\n\nclass RecipeBuilder:\n    def __init__(self, model_path):\n        self.model_path = model_path\n\n    def _make_analysis(self):\n        analyzer = ModelAnalyzer(self.model_path)\n        self.analysis = ProjectModelAnalysisSchema().load(analyzer.dict())\n\n    def _compute_loss_sensitivity(self):\n        sensitivities = []\n        parameters = []\n        
for i, node in enumerate(self.analysis[\"nodes\"]):\n            if node[\"prunable\"]:\n                sensitivities.append(node[\"prunable_equation_sensitivity\"])\n                parameters.append(node[\"prunable_params\"])\n\n        loss_analysis = pruning_loss_sens_magnitude(self.model_path)\n\n        results_model = loss_analysis.results_model\n        results = loss_analysis.results\n\n        model = {\n            \"baseline_measurement_key\": (\n                str(results_model.baseline_measurement_key)\n            ),\n            \"measurements\": {\n                str(key): val for key, val in results_model.averages.items()\n            },\n        }\n        ops = []\n\n        for res in results:\n            ops.append(\n                {\n                    \"id\": res.id_,\n                    \"name\": res.name,\n                    \"index\": res.index,\n                    \"baseline_measurement_key\": (\n                        str(res.baseline_measurement_key)\n                    ),\n                    \"measurements\": {\n                        str(key): val for key, val in res.averages.items()\n                    },\n                }\n            )\n\n        pruning = {\"model\": model, \"ops\": ops}\n        loss = {}\n        loss[\"baseline\"] = {}\n        loss[\"pruning\"] = pruning\n\n        model = PruningModelEvaluator(\n            self.analysis,\n            None,\n            loss,\n        )\n        model.eval_baseline(default_pruning_settings().sparsity)\n        model.eval_pruning(default_pruning_settings())\n\n        self.final_analysis = model.to_dict_values()\n\n    def build_recipe(self, epochs_pruning_window=None, training_epochs=10):\n        self._make_analysis()\n        self._compute_loss_sensitivity()\n\n        if epochs_pruning_window is None:\n            epochs = default_epochs_distribution(training_epochs)\n        else:\n            # TODO: set custom parameters\n            epochs = 
default_epochs_distribution(training_epochs)\n            epochs_dict = epochs._asdict()\n            epochs_dict.update(epochs_pruning_window)\n            epochs = epochs.__class__(**epochs_dict)\n\n        mods = [\n            EpochRangeModifier(\n                start_epoch=epochs.start_epoch,\n                end_epoch=epochs.end_epoch,\n            )\n        ]\n\n        node_weight_name_lookup = {\n            node[\"id\"]: node[\"weight_name\"]\n            for node in self.analysis[\"nodes\"]\n            if node[\"prunable\"]\n        }\n\n        sparsity_to_params = {}\n\n        nodes = self.final_analysis[0]\n\n        for node in nodes:\n            sparsity = node[\"sparsity\"]\n            node_id = node[\"node_id\"]\n            weight_name = node_weight_name_lookup[node_id]\n\n            if sparsity is None:\n                continue\n\n            if sparsity not in sparsity_to_params:\n                sparsity_to_params[sparsity] = []\n\n            sparsity_to_params[sparsity].append(weight_name)\n\n        for sparsity, params in sparsity_to_params.items():\n            gm_pruning = GMPruningModifier(\n                init_sparsity=0.05,\n                final_sparsity=sparsity,\n                start_epoch=epochs.pruning_start_epoch,\n                end_epoch=epochs.pruning_end_epoch,\n                update_frequency=epochs.pruning_update_frequency,\n                params=params,\n            )\n\n            mods.append(gm_pruning)\n\n        return ScheduledModifierManager(mods)\n\n\nclass PruningTrainer:\n    def __init__(self, model, bs):\n        self.data_loader = None\n        self.optimizer = None\n        self.model = model\n        self.batch_size = bs\n\n    def _setup_training(self, loss_fn=None, lr=1e-3, momentum=0.9):\n        self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n        self.model.to(self.device)\n        if loss_fn is None:\n            loss_fn = CrossEntropyLoss()\n        else:\n           
 loss_fn = CRITERION_FNS.get(loss_fn, CrossEntropyLoss())\n        self.criterion = loss_fn\n        self.optimizer = SGD(self.model.parameters(), lr=lr, momentum=momentum)\n\n    def _run_model_one_epoch(self, train=False):\n\n        if train:\n            self.model.train()\n            data_loader = self.train_data_loader\n        else:\n            self.model.eval()\n            data_loader = self.val_data_loader\n\n        running_loss = 0.0\n\n        for step, (inputs, labels) in tqdm(\n            enumerate(data_loader), total=len(data_loader)\n        ):\n            inputs = tuple(t.to(self.device) for t in inputs)\n            if not isinstance(labels, torch.Tensor):\n                labels = torch.tensor(labels)\n                if len(labels.shape) == 0:\n                    labels = labels.unsqueeze(0)\n            labels = labels.to(self.device)\n\n            if train:\n                self.optimizer.zero_grad()\n\n            outputs = self.model(\n                *inputs\n            )  # model returns logits and softmax as a tuple\n            loss = self.criterion(outputs, labels)\n\n            if train:\n                loss.backward()\n                self.optimizer.step()\n\n            running_loss += loss.item()\n\n        loss = running_loss / (len(data_loader) + 1e-5)\n        return loss\n\n    def train(\n        self, manager, train_data_loader, val_data_loader, **train_kwargs\n    ):\n        self.train_data_loader = train_data_loader\n        self.val_data_loader = val_data_loader\n        self._setup_training(**train_kwargs)\n        self.optimizer = manager.modify(\n            self.model,\n            self.optimizer,\n            steps_per_epoch=len(self.train_data_loader),\n        )\n        self.model.train()\n        # Run model pruning\n        epoch = manager.min_epochs\n        while epoch < manager.max_epochs:\n            # run training loop\n            epoch_name = \"{}/{}\".format(epoch + 1, manager.max_epochs)\n     
       logger.info(\"Running Training Epoch {}\".format(epoch_name))\n            train_loss = self._run_model_one_epoch(train=True)\n            logger.info(\n                (\"Training Epoch: {}\\nTraining Loss: {}\\n\").format(\n                    epoch_name, train_loss\n                )\n            )\n\n            # run validation loop\n            logger.info(\"Running Validation Epoch {}\".format(epoch_name))\n            val_loss = self._run_model_one_epoch()\n            logger.info(\n                \"Validation Epoch: {}\\nVal Loss: {}\\n\".format(\n                    epoch_name, val_loss\n                )\n            )\n\n            epoch += 1\n\n        manager.finalize(self.model)\n\n        return self.model\n\n\ndef _load_config(config_file: str):\n    with open(config_file, \"r\") as f:\n        config = json.load(f)\n    return config\n\n\ndef _load_data(data_dir: str):\n    data_dir = Path(data_dir)\n    return [torch.load(input_path) for input_path in data_dir.glob(\"*.pt\")]\n\n\ndef _load_model(model_file: str):\n    if os.path.isdir(model_file):\n        path = Path(model_file)\n        module_file = path / \"module.py\"\n        with open(module_file, \"r\") as f:\n            module_str = f.read()\n        exec(module_str, globals())\n        model = eval(\"NebullvmFxModule\")()\n        model.load_state_dict(torch.load(path / \"state_dict.pt\"))\n    else:\n        model = torch.load(model_file)\n    return model\n\n\ndef _train_model(\n    model: torch.nn.Module,\n    train_data: List[Tuple[Tuple, Any]],\n    eval_data: List[Tuple[Tuple, Any]],\n    epochs_pruning_window: Dict = None,\n    training_epochs: int = 10,\n    lr: float = 1e-3,\n    momentum: float = 0.9,\n    loss_fn: str = \"CrossEntropy\",\n):\n    batch_size = train_data[0][0][0].shape[0]\n    with TemporaryDirectory() as tmp_dir:\n        onnx_path = _export_model_onnx(\n            model, Path(tmp_dir), \"model.onnx\", train_data[0][0]\n        )\n        
onnx_path = onnx_path.as_posix()\n\n        recipe = RecipeBuilder(onnx_path)\n        # TODO: implement custom parameters support\n        manager = recipe.build_recipe(\n            epochs_pruning_window=epochs_pruning_window,\n            training_epochs=training_epochs,\n        )\n        trainer = PruningTrainer(model, batch_size)\n        pruned_model = trainer.train(\n            manager, train_data, eval_data, lr=lr, momentum=momentum\n        )\n        return pruned_model\n\n\ndef _save_model(model: torch.nn.Module, path: str):\n    if path.endswith(\".pt\"):\n        torch.save(model, path)\n    else:\n        torch.save(model.state_dict(), Path(path) / \"pruned_state_dict.pt\")\n\n\ndef main(\n    model_file: str,\n    train_data_dir: str,\n    eval_data_dir: str,\n    config_file: str,\n    out_file: str,\n):\n    config = _load_config(config_file)\n    model = _load_model(model_file)\n    train_data = _load_data(train_data_dir)\n    eval_data = _load_data(eval_data_dir)\n    pruned_model = _train_model(model, train_data, eval_data, **config)\n    _save_model(pruned_model, out_file)\n\n\nif __name__ == \"__main__\":\n    from argparse import ArgumentParser\n\n    parser = ArgumentParser()\n    parser.add_argument(\"--model\", help=\"The model to be pruned.\")\n    parser.add_argument(\n        \"--train_dir\",\n        help=\"The directory contained the pickled training data.\",\n    )\n    parser.add_argument(\n        \"--eval_dir\", help=\"The directory contained the pickled test data.\"\n    )\n    parser.add_argument(\"--config\", help=\"The config file.\")\n    parser.add_argument(\n        \"--pruned_model\", help=\"Path where storing the pruned model.\"\n    )\n    args = parser.parse_args()\n    main(\n        model_file=args.model,\n        train_data_dir=args.train_dir,\n        eval_data_dir=args.eval_dir,\n        config_file=args.config,\n        out_file=args.pruned_model,\n    )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/compressors/sparseml.py",
    "content": "import json\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Callable, Dict\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.operations.optimizations.compressors.base import Compressor\nfrom nebullvm.optional_modules.torch import torch, Module\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.pytorch import save_with_torch_fx, load_with_torch_fx\nfrom nebullvm.tools.venv import run_in_different_venv\n\n\ndef _save_model(model: Module, path: Path):\n    try:\n        save_with_torch_fx(model, path)\n    except Exception as ex:\n        logger.warning(\n            f\"Got an error while exporting with TorchFX. The model will be \"\n            f\"saved using the standard PyTorch save pickling method. Error \"\n            f\"got: {ex}\"\n        )\n        torch.save(model, path / \"model.pt\")\n        return path / \"model.pt\"\n    else:\n        return path\n\n\ndef _load_model(path: Path):\n    if path.is_file():\n        return torch.load(path)\n    else:\n        return load_with_torch_fx(path)\n\n\ndef _save_dataset(input_data: DataManager, path: Path):\n    path.mkdir(exist_ok=True)\n    for i, x in enumerate(input_data):\n        torch.save(x, path / f\"input_{i}.pt\")\n\n\ndef _save_json(dictionary: Dict, path: Path):\n    with open(path, \"w\") as f:\n        json.dump(dictionary, f)\n\n\ndef _write_requirements_file(path: Path):\n    requirements = \"sparseml\\nsparsify\\ntqdm\"\n    with open(path, \"w\") as f:\n        f.write(requirements)\n\n\nclass SparseMLCompressor(Compressor):\n    def execute(\n        self,\n        model: Module,\n        train_input_data: DataManager,\n        eval_input_data: DataManager,\n        metric_drop_ths: float,\n        metric: Callable,\n    ):\n        script_path = (\n            Path(__file__).parent / \"scripts/neural_magic_training.py\"\n        )\n        with TemporaryDirectory(dir=\"\") as tmp_dir:\n            tmp_dir = 
Path(tmp_dir)\n            requirements_file = tmp_dir / \"requirements.txt\"\n            model_path = _save_model(model, tmp_dir)\n            training_data_dir = tmp_dir / \"train\"\n            eval_data_dir = tmp_dir / \"eval\"\n            config_file = tmp_dir / \"config.json\"\n            pruned_model_path = (\n                tmp_dir / \"pruned_model.pt\"\n                if model_path.is_file()\n                else tmp_dir\n            )\n\n            _write_requirements_file(requirements_file)\n            _save_dataset(train_input_data, training_data_dir)\n            _save_dataset(eval_input_data, eval_data_dir)\n            _save_json(self._config, config_file)\n\n            run_in_different_venv(\n                str(requirements_file),\n                str(script_path),\n                torch.cuda.is_available(),\n                \"--model\",\n                f\"{model_path}\",\n                \"--train_dir\",\n                f\"{training_data_dir}\",\n                \"--eval_dir\",\n                f\"{eval_data_dir}\",\n                \"--config\",\n                f\"{config_file}\",\n                \"--pruned_model\",\n                f\"{pruned_model_path}\",\n            )\n\n            self.compressed_model = _load_model(pruned_model_path)\n\n            if self.compressed_model is not None:\n                error = self._compute_error(\n                    model, self.compressed_model, eval_input_data, metric\n                )\n                if error > metric_drop_ths:\n                    self.compressed_model = None\n                else:\n                    self.new_metric_ths = metric_drop_ths - error\n\n    @staticmethod\n    @torch.no_grad()\n    def _compute_error(\n        model: Module,\n        pruned_model: Module,\n        eval_input_data: DataManager,\n        metric: Callable,\n    ) -> float:\n        if len(eval_input_data) == 0:\n            return np.inf\n        metric_val = 0.0\n        model.eval()\n        
pruned_model.eval()\n        for inputs, y in eval_input_data:\n            if torch.cuda.is_available():\n                inputs = tuple(data.cuda() for data in inputs)\n                pruned_model.cuda()\n                model.cuda()\n            model_pred = model(*inputs)\n            pruned_pred = pruned_model(*inputs)\n            metric_val += metric(model_pred, pruned_pred, y)\n        return metric_val / len(eval_input_data)\n\n    @staticmethod\n    def _get_default_config() -> Dict:\n        return {\n            \"training_epochs\": 10,\n            \"epochs_pruning_window\": {\"start_epoch\": 0, \"end_epoch\": 10},\n            \"loss_fn\": \"CrossEntropy\",\n            \"lr\": 1e-3,\n            \"momentum\": 0.9,\n        }\n\n    @property\n    def config_key(self) -> str:\n        return \"sparseml\"\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/optimize_inference.py",
    "content": "from pathlib import Path\nfrom tempfile import TemporaryDirectory\nfrom typing import Any, Iterable, Callable, List, Union, Dict, Optional\n\nfrom nebullvm.config import TRAIN_TEST_SPLIT_RATIO\nfrom nebullvm.core import types\nfrom nebullvm.core.models import (\n    OptimizeInferenceResult,\n    OriginalModel,\n    OptimizedModel,\n    BenchmarkOriginalModelResult,\n    ModelCompiler,\n    ModelCompressor,\n    OptimizationTime,\n    ModelParams,\n    DeepLearningFramework,\n)\nfrom nebullvm.operations.base import Operation\nfrom nebullvm.operations.conversions.utils import get_conversion_op\nfrom nebullvm.operations.measures.measures import LatencyOriginalModelMeasure\nfrom nebullvm.operations.measures.utils import QUANTIZATION_METRIC_MAP\nfrom nebullvm.operations.optimizations.optimizers.optimizers import (\n    PytorchOptimizer,\n    TensorflowOptimizer,\n    ONNXOptimizer,\n)\nfrom nebullvm.operations.optimizations.utils import (\n    map_compilers_and_compressors,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import DataLoader as TorchDataLoader\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.optional_modules.utils import (\n    check_dependencies,\n)\nfrom nebullvm.tools.adapters import (\n    ModelAdapter,\n    DiffusionAdapter,\n    HuggingFaceAdapter,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.diffusers import (\n    is_diffusion_model_pipe,\n    is_diffusion_model,\n)\nfrom nebullvm.tools.hardware_utils import get_hw_setup\nfrom nebullvm.tools.utils import (\n    is_huggingface_data,\n    check_input_data,\n    is_data_subscriptable,\n    get_dl_framework,\n    extract_info_from_data,\n    get_model_name,\n    get_model_size_mb,\n    get_throughput,\n)\n\n\nclass OptimizeInferenceOp(Operation):\n    def __init__(self):\n        super().__init__()\n        self.torch_optimization_op = PytorchOptimizer()\n        
self.onnx_optimization_op = ONNXOptimizer()\n        self.tensorflow_optimization_op = TensorflowOptimizer()\n\n    @staticmethod\n    def _as_data_manager(data) -> DataManager:\n        if isinstance(data, DataManager):\n            return data\n        if check_input_data(data) is False:\n            raise ValueError(\n                \"The provided data does not match the expected \"\n                \"format.\\n\"\n                \"Speedster supports data in the following formats: \\n\"\n                \"- PyTorch DataLoader\\n\"\n                \"- TensorFlow Dataset\\n\"\n                \"- List of tuples: [((input_0, ... ), label), ...] \\n\"\n                \"Inputs and labels should be either tensors or numpy \"\n                \"arrays,\\n\"\n                \"depending on the framework used.\\n\"\n            )\n        if is_data_subscriptable(data):\n            return DataManager(data)\n        else:\n            return DataManager.from_iterable(data)\n\n    @staticmethod\n    def _check_inputs(model: Any, input_data: types.InputData):\n        if model is None:\n            raise ValueError(\"Input model cannot be None\")\n        if len(input_data) == 0:\n            raise ValueError(\"Input data cannot be empty\")\n\n    def execute(\n        self,\n        model: Any,\n        input_data: types.InputData,\n        metric_drop_ths: float = None,\n        metric: Union[str, Callable] = None,\n        optimization_time: str = \"constrained\",\n        dynamic_info: Dict = None,\n        config_file: str = None,\n        ignore_compilers: List[str] = None,\n        ignore_compressors: List[str] = None,\n        store_latencies: bool = False,\n        **kwargs,\n    ) -> OptimizeInferenceResult:\n\n        self._check_inputs(model, input_data)\n        check_dependencies(self.device)\n\n        ignore_compilers = map_compilers_and_compressors(\n            ignore_compilers, ModelCompiler\n        )\n        ignore_compressors = 
map_compilers_and_compressors(\n            ignore_compressors, ModelCompressor\n        )\n\n        optimization_time = OptimizationTime(optimization_time)\n\n        data = input_data\n\n        if isinstance(data, (TorchDataLoader, tf.data.Dataset)):\n            try:\n                data = DataManager.from_dataloader(data)\n            except Exception:\n                raise ValueError(\n                    \"The provided dataloader does not match the expected \"\n                    \"format.\\n\"\n                    \"Speedster supports dataloaders that return tuples in \"\n                    \"the\\n\"\n                    \"following formats: \\n\"\n                    \"Single input: (input,  label)\\n\"\n                    \"Multiple inputs: ((input1, input2, ...),  label) or \"\n                    \"(input1, input2, ...,  label)\\n\"\n                    \"Inputs and labels should be either tensors or numpy \"\n                    \"arrays,\\n\"\n                    \"depending on the framework used.\\n\"\n                )\n\n        # Setup adapters\n        model_adapter: Optional[ModelAdapter] = None\n        if is_diffusion_model_pipe(model):\n            self.logger.info(\n                \"The provided model is a diffusion model. \"\n                \"Speedster will optimize the UNet part of the model.\"\n            )\n            model_adapter = DiffusionAdapter(model, data, self.device)\n        elif is_huggingface_data(data[0]):\n            model_adapter = HuggingFaceAdapter(\n                model, data, self.device, **kwargs\n            )\n            if dynamic_info is None:\n                self.logger.warning(\n                    \"Dynamic shape info has not been provided for the \"\n                    \"HuggingFace model. The resulting optimized model \"\n                    \"will be usable only with a fixed input shape. 
\"\n                    \"To optimize the model for dynamic shapes, please \"\n                    \"look here: https://nebuly.gitbook.io/nebuly/modules/\"\n                    \"speedster/how-to-guides\"\n                    \"#using-dynamic-shape.\"\n                )\n\n        # Adapt data and model\n        if model_adapter is not None:\n            data = model_adapter.adapted_data\n            model = model_adapter.adapted_model\n\n        data = self._as_data_manager(data)\n        dl_framework = get_dl_framework(model)\n\n        if metric_drop_ths is not None and metric_drop_ths <= 0:\n            metric_drop_ths = None\n        elif metric_drop_ths is not None and metric is None:\n            metric = \"numeric_precision\"\n        if isinstance(metric, str):\n            metric = QUANTIZATION_METRIC_MAP.get(metric)\n\n        model_params: ModelParams = extract_info_from_data(\n            model=model,\n            input_data=data,\n            dl_framework=dl_framework,\n            dynamic_info=dynamic_info,\n            device=self.device,\n            is_diffusion=is_diffusion_model(model),\n        )\n\n        data.split(TRAIN_TEST_SPLIT_RATIO)\n\n        # -------- Benchmark original model --------\n        original_latency_op = LatencyOriginalModelMeasure().to(self.device)\n        orig_model_benchmark: BenchmarkOriginalModelResult = (\n            original_latency_op.execute(\n                model=model,\n                input_data=data.get_split(\"test\"),\n                dl_framework=dl_framework,\n            )\n        )\n        original_model = OriginalModel(\n            model=model,\n            latency_seconds=orig_model_benchmark.latency_seconds,\n            name=get_model_name(model),\n            size_mb=get_model_size_mb(model),\n            framework=dl_framework,\n            throughput=get_throughput(\n                latency=orig_model_benchmark.latency_seconds,\n                # Normal models have batch size B, diffusion\n 
               # models have batch size 2B\n                batch_size=model_params.batch_size\n                if not is_diffusion_model(model)\n                else model_params.batch_size / 2,\n            ),\n        )\n        # ------------------------------------------\n\n        with TemporaryDirectory() as tmp_dir:\n            tmp_dir = Path(tmp_dir) / \"fp32\"\n            tmp_dir.mkdir(parents=True, exist_ok=True)\n\n            # Convert model to all available frameworks\n            conversion_op = get_conversion_op(dl_framework)\n            conversion_op.to(self.device).set_state(model, data).execute(\n                save_path=tmp_dir,\n                model_params=model_params,\n            )\n\n            # Optimize models\n            optimized_models: List[OptimizedModel] = []\n            is_diffusion = is_diffusion_model(model)\n            for i, model in enumerate(conversion_op.get_result()):\n                optimized_models += self._optimize(\n                    model=model,\n                    input_data=data,\n                    model_outputs=orig_model_benchmark.model_outputs,\n                    optimization_time=optimization_time,\n                    metric_drop_ths=metric_drop_ths,\n                    metric=metric,\n                    model_params=model_params,\n                    ignore_compilers=ignore_compilers,\n                    ignore_compressors=ignore_compressors,\n                    source_dl_framework=dl_framework,\n                    pipeline_idx=i + 1,\n                    len_pipelines=len(conversion_op.get_result()),\n                    is_diffusion=is_diffusion,\n                )\n\n        optimized_models.sort(key=lambda x: x.latency_seconds, reverse=False)\n\n        # Check if at least one optimized model has been created\n        no_optimized_models = len(optimized_models) < 1\n        no_inference_learners = all(\n            o.inference_learner is None for o in optimized_models\n        )\n      
  if no_optimized_models or no_inference_learners:\n            self.logger.warning(\n                \"No optimized model has been created. This is likely \"\n                \"due to a bug during optimization. Please open an issue \"\n                \"and report in details your use case.\"\n            )\n\n        # Extract lowest-latency model\n        lowest_latency = self._extract_lowest_latency_model(optimized_models)\n\n        if model_adapter is not None:\n            original_model = model_adapter.adapt_original_model(original_model)\n            lowest_latency = model_adapter.adapt_inference_learner(\n                lowest_latency\n            )\n\n        return OptimizeInferenceResult(\n            original_model=original_model,\n            optimized_model=lowest_latency,\n            hardware_setup=get_hw_setup(),\n        )\n\n    def _optimize(\n        self,\n        model: Any,\n        model_outputs: Iterable,\n        input_data: types.InputData,\n        optimization_time: OptimizationTime,\n        metric_drop_ths: float,\n        metric: Callable,\n        model_params: ModelParams,\n        ignore_compilers: List[ModelCompiler],\n        ignore_compressors: List[ModelCompressor],\n        source_dl_framework: DeepLearningFramework,\n        pipeline_idx: int,\n        len_pipelines: int,\n        is_diffusion: bool,\n    ) -> List[OptimizedModel]:\n        if isinstance(model, torch.nn.Module):\n            optimization_op = self.torch_optimization_op\n            self.logger.info(\n                f\"[{pipeline_idx}/{len_pipelines}] Running PyTorch \"\n                f\"Optimization Pipeline\"\n            )\n        elif isinstance(model, tf.Module):\n            optimization_op = self.tensorflow_optimization_op\n            self.logger.info(\n                f\"[{pipeline_idx}/{len_pipelines}] Running TensorFlow \"\n                f\"Optimization Pipeline\"\n            )\n        else:\n            optimization_op = 
self.onnx_optimization_op\n            self.logger.info(\n                f\"[{pipeline_idx}/{len_pipelines}] Running ONNX \"\n                f\"Optimization Pipeline\"\n            )\n\n        # Run optimization\n        optimized_models = optimization_op.to(self.device).execute(\n            model=model,\n            input_data=input_data,\n            optimization_time=optimization_time,\n            metric_drop_ths=metric_drop_ths,\n            metric=metric,\n            model_params=model_params,\n            model_outputs=model_outputs,\n            ignore_compilers=ignore_compilers,\n            ignore_compressors=ignore_compressors,\n            source_dl_framework=source_dl_framework,\n            is_diffusion=is_diffusion,\n        )\n\n        if isinstance(model, torch.nn.Module):\n            optimization_op.free_model_gpu(model)\n\n        return optimized_models\n\n    @staticmethod\n    def _extract_lowest_latency_model(\n        models: List[OptimizedModel],\n    ) -> Optional[OptimizedModel]:\n        # fmt: off\n        inference_learner_models = [\n            m for m in models\n            if m.inference_learner is not None\n        ]\n        # fmt: on\n        if len(inference_learner_models) == 0:\n            return None\n        return min(inference_learner_models, key=lambda m: m.latency_seconds)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/optimizers/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/optimizers/base.py",
    "content": "import abc\nfrom tempfile import TemporaryDirectory\nfrom typing import Any, Callable, Dict, List, Tuple, Type, Union\n\nfrom nebullvm.config import ACTIVATION_METRIC_DROP_THS\nfrom nebullvm.core.models import (\n    OptimizedModel,\n    OptimizationTime,\n    ModelParams,\n    ModelCompiler,\n    ModelCompressor,\n    DeepLearningFramework,\n    DeviceType,\n    QuantizationType,\n)\nfrom nebullvm.operations.base import Operation\nfrom nebullvm.operations.inference_learners.base import (\n    BuildInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.builders import (\n    DeepSparseBuildInferenceLearner,\n    FasterTransformerBuildInferenceLearner,\n    IntelNeuralCompressorBuildInferenceLearner,\n    ONNXApacheTVMBuildInferenceLearner,\n    ONNXBuildInferenceLearner,\n    ONNXTensorRTBuildInferenceLearner,\n    OpenVINOBuildInferenceLearner,\n    PyTorchApacheTVMBuildInferenceLearner,\n    PyTorchTensorRTBuildInferenceLearner,\n    TensorflowBuildInferenceLearner,\n    TFLiteBuildInferenceLearner,\n    TorchNeuronBuildInferenceLearner,\n    TorchXLABuildInferenceLearner,\n    TorchDynamoBuildInferenceLearner,\n    TorchScriptBuildInferenceLearner,\n)\nfrom nebullvm.operations.measures.measures import MetricDropMeasure\nfrom nebullvm.operations.measures.utils import (\n    compute_optimized_running_time,\n    compute_relative_difference,\n)\nfrom nebullvm.operations.optimizations.compilers.base import Compiler\nfrom nebullvm.operations.optimizations.compilers.deepsparse import (\n    DeepSparseCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.faster_transformer import (\n    FasterTransformerCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.intel_neural_compressor import (  # noqa: E501\n    IntelNeuralCompressorCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.onnxruntime import (\n    ONNXCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.openvino import (\n    
OpenVINOCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.tensor_rt import (\n    ONNXTensorRTCompiler,\n    PyTorchTensorRTCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.tensorflow import (\n    TensorflowBackendCompiler,\n    TFLiteBackendCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.torch_dynamo import (\n    TorchDynamoCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.torch_neuron import (\n    TorchNeuronCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.torch_xla import (\n    TorchXLACompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.torchscript import (\n    TorchScriptCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.tvm import (\n    ONNXApacheTVMCompiler,\n    PyTorchApacheTVMCompiler,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\nfrom nebullvm.tools.utils import get_throughput\n\n\nclass Optimizer(Operation, abc.ABC):\n    def __init__(self):\n        super().__init__()\n        self.optimized_models = []\n        self.source_dl_framework = None\n        self.pipeline_dl_framework = None\n        self.compiler_ops = {}\n        self.build_inference_learner_ops = {}\n        self.validity_check_op = MetricDropMeasure()\n\n    def execute(\n        self,\n        model: Any,\n        input_data: DataManager,\n        optimization_time: OptimizationTime,\n        metric_drop_ths: float,\n        metric: Callable,\n        model_params: ModelParams,\n        model_outputs: List[Tuple[Any, ...]],\n        ignore_compilers: List[ModelCompiler],\n        ignore_compressors: List[ModelCompressor],\n        source_dl_framework: DeepLearningFramework,\n        is_diffusion: bool = False,\n    ) -> List[OptimizedModel]:\n        self.source_dl_framework = source_dl_framework\n\n      
  # TODO: implement and select compressors from hardware\n\n        compilers = self._select_compilers_from_hardware()\n\n        remove_compiler_list = []\n        add_compiler_list = []\n        for compiler in ignore_compilers:\n            if compiler in MULTI_FRAMEWORK_COMPILERS:\n                add_compiler_list += MULTI_FRAMEWORK_COMPILERS[compiler]\n                remove_compiler_list.append(compiler)\n\n        for c in remove_compiler_list:\n            ignore_compilers.remove(c)\n\n        ignore_compilers += add_compiler_list\n\n        (\n            self.compiler_ops,\n            self.build_inference_learner_ops,\n        ) = self._load_compilers(\n            ignore_compilers=ignore_compilers,\n            compilers=compilers,\n        )\n        self._optimize(\n            model=model,\n            input_data=input_data,\n            optimization_time=optimization_time,\n            metric_drop_ths=metric_drop_ths,\n            metric=metric,\n            model_params=model_params,\n            model_outputs=model_outputs,\n            ignore_compilers=ignore_compilers,\n            is_diffusion=is_diffusion,\n        )\n\n        return self.optimized_models\n\n    @abc.abstractmethod\n    def _select_compilers_from_hardware(self):\n        raise NotImplementedError()\n\n    @staticmethod\n    def _load_compilers(\n        ignore_compilers: List[ModelCompiler],\n        compilers: List[ModelCompiler],\n    ):\n        compiler_ops = {\n            compiler: COMPILER_TO_OPTIMIZER_MAP[compiler]()\n            for compiler in compilers\n            if compiler not in ignore_compilers\n            and compiler in COMPILER_TO_OPTIMIZER_MAP\n        }\n        build_inference_learner_ops = {\n            compiler: COMPILER_TO_INFERENCE_LEARNER_MAP[compiler]()\n            for compiler in compilers\n            if compiler not in ignore_compilers\n            and compiler in COMPILER_TO_OPTIMIZER_MAP\n        }\n\n        return compiler_ops, 
build_inference_learner_ops\n\n    def free_model_gpu(self, model: Any):\n        # Free gpu memory\n        if self.device.type is DeviceType.GPU:\n            try:\n                model.cpu()\n            except Exception:\n                pass\n            try:\n                with torch.cuda.device(self.device.to_torch_format()):\n                    torch.cuda.empty_cache()\n            except Exception:\n                pass\n\n    def _optimize(\n        self,\n        model: Union[torch.nn.Module, tf.Module, str],\n        input_data: DataManager,\n        optimization_time: OptimizationTime,\n        metric_drop_ths: float,\n        metric: Callable,\n        model_params: ModelParams,\n        model_outputs: List[Tuple[Any, ...]],\n        ignore_compilers: List[ModelCompiler],\n        is_diffusion: bool = False,\n    ):\n\n        if metric_drop_ths is not None:\n            q_types = [\n                None,\n            ]\n            if metric_drop_ths > 0:\n                q_types.append(QuantizationType.HALF)\n            if metric_drop_ths > ACTIVATION_METRIC_DROP_THS:\n                q_types.append(QuantizationType.DYNAMIC)\n                if input_data is not None:\n                    q_types.append(QuantizationType.STATIC)\n        else:\n            q_types = [None]\n\n        optimization_info = []\n        for compiler, compiler_op, build_inference_learner_op in zip(\n            self.compiler_ops.keys(),\n            self.compiler_ops.values(),\n            self.build_inference_learner_ops.values(),\n        ):\n            for q_type in q_types:\n                input_tfms = MultiStageTransformation([])\n\n                self.free_model_gpu(model)\n\n                with TemporaryDirectory() as tmp_dir:\n                    try:\n                        compiler_op.to(self.device).execute(\n                            model=model,\n                            input_data=input_data,\n                            
model_params=model_params,\n                            metric_drop_ths=metric_drop_ths\n                            if q_type is not None\n                            else None,\n                            quantization_type=q_type,\n                            input_tfms=input_tfms,\n                            onnx_output_path=tmp_dir,\n                            is_diffusion=is_diffusion,\n                        )\n\n                        compiled_model = compiler_op.get_result()\n                        if compiled_model is not None:\n                            build_inference_learner_op.to(self.device).execute(\n                                model=compiled_model,\n                                model_orig=compiler_op.model_orig\n                                if hasattr(compiler_op, \"model_orig\")\n                                else None,\n                                model_params=model_params,\n                                input_tfms=input_tfms,\n                                source_dl_framework=self.source_dl_framework,\n                                quantization_type=q_type,\n                            )\n                            inference_learner = (\n                                build_inference_learner_op.get_result()\n                            )\n\n                            if inference_learner is not None:\n                                test_input_data, ys = input_data.get_split(\n                                    \"test\"\n                                ).get_list(with_ys=True)\n\n                                self.validity_check_op.execute(\n                                    inference_learner,\n                                    test_input_data,\n                                    model_outputs,\n                                    metric_drop_ths,\n                                    metric_func=metric\n                                    if q_type is not None\n                                    else 
compute_relative_difference,\n                                    ys=ys,\n                                )\n\n                                if self.validity_check_op.valid:\n                                    latency = compute_optimized_running_time(\n                                        inference_learner, input_data\n                                    )\n                                    self.logger.info(\n                                        f\"Optimized model latency: {latency} \"\n                                        f\"sec/iter\"\n                                    )\n\n                                    if (\n                                        compiler not in ignore_compilers\n                                        and optimization_time\n                                        is OptimizationTime.CONSTRAINED\n                                    ):\n                                        ignore_compilers.append(compiler)\n\n                                    self.optimized_models.append(\n                                        OptimizedModel(\n                                            inference_learner=inference_learner,  # noqa: E501\n                                            metric_drop=self.validity_check_op.measure_result,  # noqa: E501\n                                            compiler=compiler,\n                                            technique=q_type.name\n                                            if q_type is not None\n                                            else \"None\",\n                                            latency_seconds=latency,\n                                            throughput=get_throughput(\n                                                latency,\n                                                # Normal models have batch\n                                                # size B, diffusion models\n                                                # have batch size 2B\n                      
                          model_params.batch_size\n                                                if not is_diffusion\n                                                else model_params.batch_size\n                                                / 2,\n                                            ),\n                                            size_mb=inference_learner.get_size()  # noqa: E501\n                                            / 1e6,\n                                        )\n                                    )\n\n                                    opt_info_dict = {\n                                        \"compiler\": f\"{self.pipeline_dl_framework.value}_{compiler.value}\",  # noqa: E501\n                                        \"technique\": q_type.value\n                                        if q_type\n                                        else \"none\",\n                                        \"latency\": latency,\n                                    }\n                                    if (\n                                        metric_drop_ths is not None\n                                        and q_type is not None\n                                    ):\n                                        opt_info_dict[\n                                            \"metric_loss\"\n                                        ] = (\n                                            self.validity_check_op.measure_result  # noqa: E501\n                                        )\n                                        opt_info_dict[\n                                            \"metric\"\n                                        ] = metric.__name__\n                                    optimization_info.append(opt_info_dict)\n                                else:\n                                    self.logger.warning(\n                                        \"The optimized model will be \"\n                                        \"discarded due to poor 
results \"\n                                        \"obtained with the given metric.\"\n                                    )\n\n                                if self.device.type in [\n                                    DeviceType.GPU,\n                                    DeviceType.TPU,\n                                ]:\n                                    inference_learner.free_gpu_memory()\n                    except Exception as ex:\n                        self.logger.warning(\n                            f\"Optimization failed with \"\n                            f\"{self.pipeline_dl_framework} \"\n                            f\"interface of {compiler}. Got error {ex}. \"\n                            f\"If possible the compilation will be re-scheduled\"\n                            f\" with another interface. Please consult the \"\n                            f\"documentation for further info or open an issue \"\n                            f\"on GitHub for receiving assistance.\"\n                        )\n                        optimization_info.append(\n                            {\n                                \"compiler\": compiler.value,\n                                \"technique\": q_type.value\n                                if q_type\n                                else \"none\",\n                                \"latency\": -1,\n                            }\n                        )\n        if self.feedback_collector is not None:\n            self.feedback_collector.store_info(\n                key=\"optimizations\",\n                value=optimization_info,\n            )\n\n\nMULTI_FRAMEWORK_COMPILERS = {\n    ModelCompiler.TENSOR_RT: [\n        ModelCompiler.TENSOR_RT_TORCH,\n        ModelCompiler.TENSOR_RT_ONNX,\n    ],\n    ModelCompiler.APACHE_TVM: [\n        ModelCompiler.APACHE_TVM_TORCH,\n        ModelCompiler.APACHE_TVM_ONNX,\n    ],\n}\n\nCOMPILER_TO_OPTIMIZER_MAP: Dict[ModelCompiler, Type[Compiler]] = {\n    
ModelCompiler.TORCHSCRIPT: TorchScriptCompiler,\n    ModelCompiler.DEEPSPARSE: DeepSparseCompiler,\n    ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorCompiler,\n    ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTCompiler,\n    ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTCompiler,\n    ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMCompiler,\n    ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMCompiler,\n    ModelCompiler.ONNX_RUNTIME: ONNXCompiler,\n    ModelCompiler.OPENVINO: OpenVINOCompiler,\n    ModelCompiler.TFLITE: TFLiteBackendCompiler,\n    ModelCompiler.XLA: TensorflowBackendCompiler,\n    ModelCompiler.TORCH_NEURON: TorchNeuronCompiler,\n    ModelCompiler.TORCH_XLA: TorchXLACompiler,\n    ModelCompiler.TORCH_DYNAMO: TorchDynamoCompiler,\n    ModelCompiler.FASTER_TRANSFORMER: FasterTransformerCompiler,\n}\n\nCOMPILER_TO_INFERENCE_LEARNER_MAP: Dict[\n    ModelCompiler, Type[BuildInferenceLearner]\n] = {\n    ModelCompiler.TORCHSCRIPT: TorchScriptBuildInferenceLearner,\n    ModelCompiler.DEEPSPARSE: DeepSparseBuildInferenceLearner,\n    ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorBuildInferenceLearner,  # noqa: E501\n    ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTBuildInferenceLearner,\n    ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTBuildInferenceLearner,\n    ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMBuildInferenceLearner,\n    ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMBuildInferenceLearner,\n    ModelCompiler.ONNX_RUNTIME: ONNXBuildInferenceLearner,\n    ModelCompiler.OPENVINO: OpenVINOBuildInferenceLearner,\n    ModelCompiler.TFLITE: TFLiteBuildInferenceLearner,\n    ModelCompiler.XLA: TensorflowBuildInferenceLearner,\n    ModelCompiler.TORCH_NEURON: TorchNeuronBuildInferenceLearner,\n    ModelCompiler.TORCH_XLA: TorchXLABuildInferenceLearner,\n    ModelCompiler.TORCH_DYNAMO: TorchDynamoBuildInferenceLearner,\n    ModelCompiler.FASTER_TRANSFORMER: FasterTransformerBuildInferenceLearner,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/optimizers/optimizers.py",
    "content": "import platform\n\nfrom nebullvm.core.models import (\n    DeepLearningFramework,\n    DeviceType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import Optimizer\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    tvm_is_available,\n    bladedisc_is_available,\n    deepsparse_is_available,\n    intel_neural_compressor_is_available,\n    torch_tensorrt_is_available,\n    onnxruntime_is_available,\n    tensorrt_is_available,\n    openvino_is_available,\n    torch_neuron_is_available,\n    torch_xla_is_available,\n    faster_transformer_is_available,\n)\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.optional_modules.utils import (\n    torch_is_available,\n    tensorflow_is_available,\n    onnx_is_available,\n)\nfrom nebullvm.tools.utils import check_module_version\n\n\nclass PytorchOptimizer(Optimizer):\n    def __init__(self):\n        super().__init__()\n        self.pipeline_dl_framework = DeepLearningFramework.PYTORCH\n\n    def _select_compilers_from_hardware(self):\n        compilers = []\n        if torch_is_available():\n            if self.device.type is DeviceType.TPU:\n                if torch_xla_is_available():\n                    compilers.append(ModelCompiler.TORCH_XLA)\n                else:\n                    raise RuntimeError(\n                        \"Torch XLA is not available on your platform. \"\n                        \"Please install torch-xla the readme at this \"\n                        \"link: https://github.com/pytorch/xla\"\n                    )\n            elif self.device.type is DeviceType.NEURON:\n                if torch_neuron_is_available():\n                    compilers.append(ModelCompiler.TORCH_NEURON)\n                else:\n                    raise RuntimeError(\n                        \"Torch Neuron is not available on your platform. 
\"\n                        \"Please install torch-neuron by following \"\n                        \"this guide: https://awsdocs-neuron\"\n                        \".readthedocs-hosted.com/en/latest/general/\"\n                        \"quick-start/torch-neuron.html.\"\n                    )\n            else:\n                compilers.append(ModelCompiler.TORCHSCRIPT)\n                if (\n                    check_module_version(torch, min_version=\"2.0.0\")\n                    and platform.system() != \"Windows\"\n                    and False\n                ):  # Deactivated because save and load methods are\n                    # not implemented\n                    compilers.append(ModelCompiler.TORCH_DYNAMO)\n                if tvm_is_available():\n                    compilers.append(ModelCompiler.APACHE_TVM_TORCH)\n                if bladedisc_is_available():\n                    compilers.append(ModelCompiler.BLADEDISC)\n\n                if self.device.type is DeviceType.CPU:\n                    if deepsparse_is_available():\n                        compilers.append(ModelCompiler.DEEPSPARSE)\n                    if intel_neural_compressor_is_available():\n                        compilers.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)\n                elif self.device.type is DeviceType.GPU:\n                    if torch_tensorrt_is_available():\n                        compilers.append(ModelCompiler.TENSOR_RT_TORCH)\n                    if faster_transformer_is_available():\n                        compilers.append(ModelCompiler.FASTER_TRANSFORMER)\n        return compilers\n\n\nclass TensorflowOptimizer(Optimizer):\n    def __init__(self):\n        super().__init__()\n        self.pipeline_dl_framework = DeepLearningFramework.TENSORFLOW\n\n    def _select_compilers_from_hardware(self):\n        compilers = []\n        if tensorflow_is_available():\n            compilers.append(ModelCompiler.XLA)\n            
compilers.append(ModelCompiler.TFLITE)\n        return compilers\n\n\nclass ONNXOptimizer(Optimizer):\n    def __init__(self):\n        super().__init__()\n        self.pipeline_dl_framework = DeepLearningFramework.NUMPY\n\n    def _select_compilers_from_hardware(self):\n        compilers = []\n        if onnx_is_available():\n            if onnxruntime_is_available():\n                compilers.append(ModelCompiler.ONNX_RUNTIME)\n            if tvm_is_available():\n                compilers.append(ModelCompiler.APACHE_TVM_ONNX)\n            if self.device.type is DeviceType.GPU and tensorrt_is_available():\n                compilers.append(ModelCompiler.TENSOR_RT_ONNX)\n            if self.device.type is DeviceType.CPU and openvino_is_available():\n                compilers.append(ModelCompiler.OPENVINO)\n        return compilers\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_deepsparse.py",
    "content": "from tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\n\nfrom nebullvm.config import CONSTRAINED_METRIC_DROP_THS\nfrom nebullvm.core.models import (\n    Device,\n    DeviceType,\n    DeepLearningFramework,\n    ModelCompiler,\n)\nfrom nebullvm.operations.inference_learners.deepsparse import (\n    DEEPSPARSE_INFERENCE_LEARNERS,\n)\nfrom nebullvm.operations.measures.measures import MetricDropMeasure\nfrom nebullvm.operations.measures.utils import compute_relative_difference\nfrom nebullvm.operations.optimizations.compilers.deepsparse import (\n    DeepSparseCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    deepsparse_is_available,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import initialize_model\nfrom nebullvm.operations.inference_learners.utils import load_model\n\ndevice = Device(DeviceType.CPU)\n\n\n@pytest.mark.parametrize(\n    (\"output_library\", \"dynamic\"),\n    [\n        # (DeepLearningFramework.PYTORCH, True),\n        (DeepLearningFramework.PYTORCH, False),\n    ],\n)\n@pytest.mark.skipif(\n    not deepsparse_is_available(),\n    reason=\"Can't test deepsparse if it's not installed.\",\n)\ndef test_deepsparse(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type=None,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, None, output_library, device)\n\n        compiler_op = DeepSparseCompiler()\n        compiler_op.to(device).execute(\n            model=model,\n            onnx_output_path=tmp_dir,\n            model_params=model_params,\n            quantization_type=None,\n            input_data=input_data,\n        )\n\n        compiled_model = 
compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.DEEPSPARSE\n        ]()\n\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(\n            optimized_model, DEEPSPARSE_INFERENCE_LEARNERS[output_library]\n        )\n        assert isinstance(optimized_model.get_size(), int)\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(\n            loaded_model, DEEPSPARSE_INFERENCE_LEARNERS[output_library]\n        )\n\n        inputs_example = optimized_model.get_inputs_example()\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        test_input_data, ys = input_data.get_split(\"test\").get_list(\n            with_ys=True\n        )\n\n        validity_check_op = MetricDropMeasure()\n        validity_check_op.execute(\n            optimized_model,\n            test_input_data,\n            model_outputs,\n            CONSTRAINED_METRIC_DROP_THS,\n            metric_func=metric\n            if quantization_type is not None\n            else compute_relative_difference,\n            ys=ys,\n        )\n\n        # Check validity of the optimized model\n        assert 
validity_check_op.get_result()\n\n        # Dynamic batch size is currently not supported from deepsparse\n        # if dynamic:\n        #     inputs_example = [\n        #         input_[: len(input_) // 2] for input_ in inputs_example\n        #     ]\n        #     res = model(*inputs_example)\n        #     assert res is not None\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_intel_neural_compressor.py",
    "content": "from tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\n\nfrom nebullvm.core.models import (\n    DeviceType,\n    Device,\n    QuantizationType,\n    DeepLearningFramework,\n    ModelCompiler,\n)\nfrom nebullvm.operations.inference_learners.neural_compressor import (\n    NEURAL_COMPRESSOR_INFERENCE_LEARNERS,\n)\nfrom nebullvm.operations.optimizations.compilers.intel_neural_compressor import (  # noqa: E501\n    IntelNeuralCompressorCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    intel_neural_compressor_is_available,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.operations.inference_learners.utils import load_model\n\ndevice = Device(DeviceType.CPU)\n\n\n@pytest.mark.parametrize(\n    (\"output_library\", \"dynamic\", \"metric_drop_ths\", \"quantization_type\"),\n    [\n        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.DYNAMIC),\n        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.DYNAMIC),\n        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.STATIC),\n        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.STATIC),\n    ],\n)\n@pytest.mark.skipif(\n    not intel_neural_compressor_is_available(),\n    reason=\"Can't test neural compressor if it's not installed.\",\n)\ndef test_neural_compressor(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    metric_drop_ths: float,\n    quantization_type: QuantizationType,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, None, output_library, device)\n\n        compiler_op = IntelNeuralCompressorCompiler()\n 
       compiler_op.to(device).execute(\n            model=model,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.INTEL_NEURAL_COMPRESSOR\n        ]()\n\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n\n        assert isinstance(\n            optimized_model,\n            NEURAL_COMPRESSOR_INFERENCE_LEARNERS[output_library],\n        )\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(\n            loaded_model, NEURAL_COMPRESSOR_INFERENCE_LEARNERS[output_library]\n        )\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = optimized_model.get_inputs_example()\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if 
dynamic:\n            inputs_example = [\n                input_[: len(input_) // 2] for input_ in inputs_example\n            ]\n            res = model(*inputs_example)\n            assert res is not None\n\n            res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(res_tensor, res_orig_tensor, rtol=1e-01)\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_onnxruntime.py",
    "content": "import sys\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nimport onnx\nimport pytest\nimport torch\n\nfrom nebullvm.core.models import (\n    Device,\n    DeviceType,\n    DeepLearningFramework,\n    QuantizationType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.conversions.converters import PytorchConverter\nfrom nebullvm.operations.inference_learners.onnx import ONNX_INFERENCE_LEARNERS\nfrom nebullvm.operations.optimizations.compilers.onnxruntime import (\n    ONNXCompiler,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.operations.inference_learners.utils import load_model\nfrom nebullvm.tools.utils import gpu_is_available\n\ndevice = (\n    Device(DeviceType.GPU) if gpu_is_available() else Device(DeviceType.CPU)\n)\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n        \"external_data_format\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None, True),\n        (DeepLearningFramework.PYTORCH, True, None, None, None, False),\n        (DeepLearningFramework.PYTORCH, False, None, None, None, False),\n    ],\n)\ndef test_onnxruntime(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n    external_data_format: bool,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        model_path = Path(tmp_dir) / \"fp32\"\n        model_path.mkdir(parents=True)\n\n        
converter_op = PytorchConverter()\n        converter_op.to(device).set_state(model, input_data).execute(\n            model_path, model_params\n        )\n\n        converted_models = converter_op.get_result()\n        assert len(converted_models) > 1\n\n        model_path = str(\n            [model for model in converted_models if isinstance(model, Path)][0]\n        )\n\n        # Test onnx external data format (large models)\n        if external_data_format:\n            onnx_model = onnx.load(model_path)\n            onnx.save_model(\n                onnx_model,\n                model_path,\n                save_as_external_data=True,\n                all_tensors_to_one_file=False,\n            )\n\n        compiler_op = ONNXCompiler()\n        compiler_op.to(device).execute(\n            model=model_path,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.ONNX_RUNTIME\n        ]()\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n            quantization_type=quantization_type,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(\n            optimized_model, ONNX_INFERENCE_LEARNERS[output_library]\n        )\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(\n            loaded_model, ONNX_INFERENCE_LEARNERS[output_library]\n        )\n\n     
   assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller bath_size\n            torch_device = torch.device(\n                \"cuda\" if torch.cuda.is_available() else \"cpu\"\n            )\n\n            inputs_example = [\n                input_[: len(input_) // 2].to(torch_device)\n                for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            with torch.inference_mode():\n                res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(res_tensor, res_orig_tensor, rtol=2e-01)\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n        \"external_data_format\",\n    ),\n    [\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.DYNAMIC,\n            2,\n            \"numeric_precision\",\n            False,\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n  
          QuantizationType.STATIC,\n            2,\n            \"numeric_precision\",\n            False,\n        ),\n    ],\n)\n@pytest.mark.skipif(\n    torch.cuda.is_available(),\n    reason=\"onnxruntime with int8 precision is very slow on GPU\",\n)\ndef test_onnxruntime_quantization(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n    external_data_format: bool,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        model_path = Path(tmp_dir) / \"fp32\"\n        model_path.mkdir(parents=True)\n\n        converter_op = PytorchConverter()\n        converter_op.to(device).set_state(model, input_data).execute(\n            model_path, model_params\n        )\n\n        converted_models = converter_op.get_result()\n        assert len(converted_models) > 1\n\n        model_path = str(\n            [model for model in converted_models if isinstance(model, Path)][0]\n        )\n\n        # Test onnx external data format (large models)\n        if external_data_format:\n            onnx_model = onnx.load(model_path)\n            onnx.save_model(\n                onnx_model,\n                model_path,\n                save_as_external_data=True,\n                all_tensors_to_one_file=False,\n            )\n\n        compiler_op = ONNXCompiler()\n        compiler_op.to(device).execute(\n            model=model_path,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            
ModelCompiler.ONNX_RUNTIME\n        ]()\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n            quantization_type=quantization_type,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(\n            optimized_model, ONNX_INFERENCE_LEARNERS[output_library]\n        )\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(\n            loaded_model, ONNX_INFERENCE_LEARNERS[output_library]\n        )\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller bath_size\n            torch_device = torch.device(\n                \"cuda\" if torch.cuda.is_available() else \"cpu\"\n            )\n\n            inputs_example = [\n                input_[: len(input_) // 2].to(torch_device)\n                for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n       
     assert res is not None\n\n            with torch.inference_mode():\n                res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(res_tensor, res_orig_tensor, rtol=2e-01)\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n        \"external_data_format\",\n    ),\n    [\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n            False,\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n            True,\n        ),\n    ],\n)\n@pytest.mark.skipif(\n    sys.platform == \"win32\",\n    reason=\"onnxruntime with half precision on windows does not work\",\n)\n@pytest.mark.skipif(\n    not torch.cuda.is_available(),\n    reason=\"onnxruntime with half precision is very slow on CPU\",\n)\ndef test_onnxruntime_half(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n    external_data_format: bool,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        model_path = Path(tmp_dir) / \"fp32\"\n        model_path.mkdir(parents=True)\n\n        converter_op = PytorchConverter()\n        converter_op.to(device).set_state(model, input_data).execute(\n            model_path, model_params\n        )\n\n        
converted_models = converter_op.get_result()\n        assert len(converted_models) > 1\n\n        model_path = str(\n            [model for model in converted_models if isinstance(model, Path)][0]\n        )\n\n        # Test onnx external data format (large models)\n        if external_data_format:\n            onnx_model = onnx.load(model_path)\n            onnx.save_model(\n                onnx_model,\n                model_path,\n                save_as_external_data=True,\n                all_tensors_to_one_file=False,\n            )\n\n        compiler_op = ONNXCompiler()\n        compiler_op.to(device).execute(\n            model=model_path,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.ONNX_RUNTIME\n        ]()\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n            quantization_type=quantization_type,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(\n            optimized_model, ONNX_INFERENCE_LEARNERS[output_library]\n        )\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = ONNX_INFERENCE_LEARNERS[output_library].load(tmp_dir)\n        assert isinstance(\n            loaded_model, ONNX_INFERENCE_LEARNERS[output_library]\n        )\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n     
   res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller batch_size\n            torch_device = torch.device(\n                \"cuda\" if torch.cuda.is_available() else \"cpu\"\n            )\n\n            inputs_example = [\n                input_[: len(input_) // 2].to(torch_device)\n                for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            with torch.inference_mode():\n                res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=1e-01\n                    )\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_openvino.py",
    "content": "from pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nimport cpuinfo\nimport pytest\nimport torch\n\nfrom nebullvm.core.models import (\n    DeepLearningFramework,\n    QuantizationType,\n    Device,\n    DeviceType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.conversions.converters import PytorchConverter\nfrom nebullvm.operations.inference_learners.openvino import (\n    OPENVINO_INFERENCE_LEARNERS,\n)\nfrom nebullvm.operations.optimizations.compilers.openvino import (\n    OpenVINOCompiler,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.operations.inference_learners.utils import load_model\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None),\n        (DeepLearningFramework.PYTORCH, False, None, None, None),\n        (\n            DeepLearningFramework.PYTORCH,\n            False,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            False,\n            QuantizationType.STATIC,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.STATIC,\n            2,\n            \"numeric_precision\",\n        ),\n    ],\n)\n@pytest.mark.skipif(\n    \"intel\" not in cpuinfo.get_cpu_info()[\"brand_raw\"].lower(),\n    reason=\"Openvino is only available for intel processors.\",\n)\ndef test_openvino(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: 
int,\n    metric: str,\n):\n    device = Device(DeviceType.CPU)\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        model_path = Path(tmp_dir) / \"fp32\"\n        model_path.mkdir(parents=True)\n\n        converter_op = PytorchConverter()\n        converter_op.to(device).set_state(model, input_data).execute(\n            model_path, model_params\n        )\n\n        converted_models = converter_op.get_result()\n        assert len(converted_models) > 1\n\n        model_path = str(\n            [model for model in converted_models if isinstance(model, Path)][0]\n        )\n\n        compiler_op = OpenVINOCompiler()\n        compiler_op.to(device).execute(\n            model=model_path,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.OPENVINO\n        ]()\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(\n            optimized_model, OPENVINO_INFERENCE_LEARNERS[output_library]\n        )\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert 
isinstance(\n            loaded_model, OPENVINO_INFERENCE_LEARNERS[output_library]\n        )\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller batch_size\n            inputs_example = [\n                input_[: len(input_) // 2] for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=2e-01\n                    )\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tensor_rt.py",
    "content": "from pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\n\nfrom nebullvm.core.models import (\n    Device,\n    DeviceType,\n    DeepLearningFramework,\n    QuantizationType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.conversions.converters import PytorchConverter\nfrom nebullvm.operations.inference_learners.tensor_rt import (\n    TENSOR_RT_INFERENCE_LEARNERS,\n    PytorchTensorRTInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.tensor_rt import (\n    ONNXTensorRTCompiler,\n    PyTorchTensorRTCompiler,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.operations.inference_learners.utils import load_model\nfrom nebullvm.tools.utils import check_module_version\n\ndevice = Device(DeviceType.GPU)\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None),\n        (DeepLearningFramework.PYTORCH, False, None, None, None),\n        (\n            DeepLearningFramework.PYTORCH,\n            False,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            False,\n            QuantizationType.STATIC,\n            2,\n            \"numeric_precision\",\n        ),\n    ],\n)\n@pytest.mark.skipif(\n    not torch.cuda.is_available(),\n    reason=\"Skip because cuda is not available.\",\n)\ndef test_tensorrt_onnx(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    with TemporaryDirectory() as tmp_dir:\n 
       (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        model_path = Path(tmp_dir) / \"fp32\"\n        model_path.mkdir(parents=True)\n\n        converter_op = PytorchConverter()\n        converter_op.to(device).set_state(model, input_data).execute(\n            model_path, model_params\n        )\n\n        converted_models = converter_op.get_result()\n        assert len(converted_models) > 1\n\n        model_path = str(\n            [model for model in converted_models if isinstance(model, Path)][0]\n        )\n        compiler_op = ONNXTensorRTCompiler()\n        compiler_op.to(device).execute(\n            model=model_path,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.TENSOR_RT_ONNX\n        ]()\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(\n            optimized_model, TENSOR_RT_INFERENCE_LEARNERS[output_library]\n        )\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(\n            loaded_model, TENSOR_RT_INFERENCE_LEARNERS[output_library]\n        )\n\n        
assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = tuple(optimized_model.get_inputs_example())\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:\n            torch_device = torch.device(\n                \"cuda\" if torch.cuda.is_available() else \"cpu\"\n            )\n\n            inputs_example = [\n                input_[: len(input_) // 2].to(torch_device)\n                for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            with torch.inference_mode():\n                res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=1e-01\n                    )\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None),\n        (DeepLearningFramework.PYTORCH, False, None, None, None),\n        (\n            DeepLearningFramework.PYTORCH,\n            False,\n            QuantizationType.HALF,\n            2,\n            
\"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            False,\n            QuantizationType.STATIC,\n            2,\n            \"numeric_precision\",\n        ),\n    ],\n)\n@pytest.mark.skipif(\n    not torch.cuda.is_available(),\n    reason=\"Skip because cuda is not available.\",\n)\n@pytest.mark.skipif(\n    not check_module_version(torch, max_version=\"1.13.1+cu117\"),\n    reason=\"Skip because torch version is not supported.\",\n)\ndef test_tensorrt_torch(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        compiler_op = PyTorchTensorRTCompiler()\n        compiler_op.to(device).execute(\n            model=model,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.TENSOR_RT_TORCH\n        ]()\n\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(optimized_model, PytorchTensorRTInferenceLearner)\n\n        # Test save and load 
functions\n        optimized_model.save(tmp_dir)\n        loaded_model = PytorchTensorRTInferenceLearner.load(tmp_dir)\n        assert isinstance(loaded_model, PytorchTensorRTInferenceLearner)\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = tuple(optimized_model.get_inputs_example())\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller batch_size\n            torch_device = torch.device(\n                \"cuda\" if torch.cuda.is_available() else \"cpu\"\n            )\n\n            inputs_example = [\n                input_[: len(input_) // 2].to(torch_device)\n                for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=1e-01\n                    )\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tensorflow.py",
    "content": "from tempfile import TemporaryDirectory\n\nimport pytest\n\nfrom nebullvm.core.models import (\n    DeepLearningFramework,\n    QuantizationType,\n    Device,\n    DeviceType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.inference_learners.tensorflow import (\n    TensorflowBackendInferenceLearner,\n    TFLiteBackendInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.tensorflow import (\n    TensorflowBackendCompiler,\n    TFLiteBackendCompiler,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.operations.inference_learners.utils import load_model\nfrom nebullvm.tools.utils import gpu_is_available\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.TENSORFLOW, False, None, None, None),\n        (DeepLearningFramework.TENSORFLOW, True, None, None, None),\n    ],\n)\ndef test_tensorflow_backend(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    device = (\n        Device(DeviceType.GPU)\n        if gpu_is_available()\n        else Device(DeviceType.CPU)\n    )\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        compiler_op = TensorflowBackendCompiler()\n        compiler_op.to(device).execute(\n            model=model,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            
quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.XLA\n        ]()\n\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n\n        assert isinstance(optimized_model, TensorflowBackendInferenceLearner)\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(loaded_model, TensorflowBackendInferenceLearner)\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n        res = optimized_model.predict(*inputs_example)\n        assert res is not None\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller bath_size\n            inputs_example = [\n                input_[: len(input_) // 2] for input_ in inputs_example\n            ]\n            res = optimized_model.predict(*inputs_example)\n            assert res is not None\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (\n            DeepLearningFramework.TENSORFLOW,\n            False,\n    
        None,\n            0.1,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.TENSORFLOW,\n            True,\n            None,\n            0.1,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.TENSORFLOW,\n            True,\n            QuantizationType.DYNAMIC,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.TENSORFLOW,\n            True,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.TENSORFLOW,\n            True,\n            QuantizationType.STATIC,\n            2,\n            \"numeric_precision\",\n        ),\n    ],\n)\ndef test_tf_lite(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    device = Device(DeviceType.CPU)\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        compiler_op = TFLiteBackendCompiler()\n        compiler_op.to(device).execute(\n            model=model,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.TFLITE\n        ]()\n\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n 
           input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n\n        assert isinstance(optimized_model, TFLiteBackendInferenceLearner)\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = TFLiteBackendInferenceLearner.load(tmp_dir)\n        assert isinstance(loaded_model, TFLiteBackendInferenceLearner)\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n        res = optimized_model.predict(*inputs_example)\n        assert res is not None\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller batch_size\n            inputs_example = [\n                input_[: len(input_) // 2] for input_ in inputs_example\n            ]\n            res = optimized_model.predict(*inputs_example)\n            assert res is not None\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_torch_dynamo.py",
    "content": "import platform\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\n\nfrom nebullvm.core.models import (\n    DeviceType,\n    Device,\n    DeepLearningFramework,\n    QuantizationType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.inference_learners.torch_dynamo import (\n    TorchDynamoInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.torch_dynamo import (\n    TorchDynamoCompiler,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.tools.utils import gpu_is_available, check_module_version\n\ndevice = (\n    Device(DeviceType.GPU) if gpu_is_available() else Device(DeviceType.CPU)\n)\n\n\ndef run_test_torch_dynamo(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    with TemporaryDirectory() as tmp_dir:  # noqa: F841\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        compiler_op = TorchDynamoCompiler()\n        compiler_op.to(device).execute(\n            model=model,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n            model_params=model_params,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.TORCH_DYNAMO\n        ]()\n\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if 
hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(optimized_model, TorchDynamoInferenceLearner)\n\n        # Test save and load functions\n        # optimized_model.save(tmp_dir)\n        # loaded_model = load_model(tmp_dir)\n        # assert isinstance(loaded_model, TorchDynamoInferenceLearner)\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        # res_loaded = loaded_model(*inputs_example)\n        # assert all(\n        #     [\n        #         torch.allclose(res_tensor, res_loaded_tensor)\n        #         for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n        #     ]\n        # )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller batch_size\n            torch_device = torch.device(\n                \"cuda\" if torch.cuda.is_available() else \"cpu\"\n            )\n\n            inputs_example = [\n                input_[: len(input_) // 2].to(torch_device)\n                for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=2e-01\n                    )\n                    for 
(res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None),\n        (DeepLearningFramework.PYTORCH, False, None, None, None),\n    ],\n)\n@pytest.mark.skipif(\n    not check_module_version(torch, min_version=\"2.0.0\"),\n    reason=\"Torch version is not supported\",\n)\n@pytest.mark.skipif(\n    platform.system() == \"Windows\",\n    reason=\"Torch compile() is not currently supported on windows\",\n)\ndef test_torch_dynamo_fp32(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    run_test_torch_dynamo(\n        output_library,\n        dynamic,\n        quantization_type,\n        metric_drop_ths,\n        metric,\n    )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_torchscript.py",
    "content": "from tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\n\nfrom nebullvm.core.models import (\n    DeviceType,\n    Device,\n    DeepLearningFramework,\n    QuantizationType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.inference_learners.torchscript import (\n    TorchScriptInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.torchscript import (\n    TorchScriptCompiler,\n)\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.operations.inference_learners.utils import load_model\nfrom nebullvm.tools.utils import gpu_is_available\n\ndevice = (\n    Device(DeviceType.GPU) if gpu_is_available() else Device(DeviceType.CPU)\n)\n\n\ndef run_test_torchscript(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        compiler_op = TorchScriptCompiler()\n        compiler_op.to(device).execute(\n            model=model,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.TORCHSCRIPT\n        ]()\n\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n       
     else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(optimized_model, TorchScriptInferenceLearner)\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(loaded_model, TorchScriptInferenceLearner)\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = list(optimized_model.get_inputs_example())\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:  # Check also with a smaller batch_size\n            torch_device = torch.device(\n                \"cuda\" if torch.cuda.is_available() else \"cpu\"\n            )\n\n            inputs_example = [\n                input_[: len(input_) // 2].to(torch_device)\n                for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=2e-01\n                    )\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n   
         )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None),\n        (DeepLearningFramework.PYTORCH, False, None, None, None),\n    ],\n)\ndef test_torchscript_no_quantization(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    run_test_torchscript(\n        output_library,\n        dynamic,\n        quantization_type,\n        metric_drop_ths,\n        metric,\n    )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n        )\n    ],\n)\n@pytest.mark.skipif(\n    not torch.cuda.is_available(),\n    reason=\"Half quantization is not available on CPU\",\n)\ndef test_torchscript_half_quantization(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    run_test_torchscript(\n        output_library,\n        dynamic,\n        quantization_type,\n        metric_drop_ths,\n        metric,\n    )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.DYNAMIC,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.STATIC,\n    
        2,\n            \"numeric_precision\",\n        ),\n    ],\n)\n@pytest.mark.skipif(\n    torch.cuda.is_available(),\n    reason=\"INT8 quantization is not available on GPU\",\n)\ndef test_torchscript_int8_quantization(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    run_test_torchscript(\n        output_library,\n        dynamic,\n        quantization_type,\n        metric_drop_ths,\n        metric,\n    )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tvm.py",
    "content": "from pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\n\nfrom nebullvm.core.models import (\n    Device,\n    DeviceType,\n    DeepLearningFramework,\n    QuantizationType,\n    ModelCompiler,\n)\nfrom nebullvm.operations.conversions.converters import PytorchConverter\nfrom nebullvm.operations.inference_learners.tvm import (\n    PytorchApacheTVMInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.tvm import (\n    ONNXApacheTVMCompiler,\n    PyTorchApacheTVMCompiler,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import tvm_is_available\nfrom nebullvm.operations.optimizations.optimizers.base import (\n    COMPILER_TO_INFERENCE_LEARNER_MAP,\n)\nfrom nebullvm.operations.optimizations.tests.utils import (\n    initialize_model,\n    check_model_validity,\n)\nfrom nebullvm.operations.inference_learners.utils import load_model\nfrom nebullvm.tools.utils import gpu_is_available\n\ndevice = (\n    Device(DeviceType.GPU) if gpu_is_available() else Device(DeviceType.CPU)\n)\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None),\n        (DeepLearningFramework.PYTORCH, False, None, None, None),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.DYNAMIC,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n        ),\n        # (\n        #     DeepLearningFramework.PYTORCH,\n        #     True,\n        #     QuantizationType.STATIC,\n        #     2,\n        #     \"numeric_precision\",\n        # ),\n    ],\n)\n@pytest.mark.skipif(\n    not tvm_is_available(), 
reason=\"Apache TVM is not installed\"\n)\ndef test_tvm_onnx(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n\n        model_path = Path(tmp_dir) / \"fp32\"\n        model_path.mkdir(parents=True)\n\n        converter_op = PytorchConverter()\n        converter_op.to(device).set_state(model, input_data).execute(\n            model_path, model_params\n        )\n\n        converted_models = converter_op.get_result()\n        assert len(converted_models) > 1\n\n        model_path = str(\n            [model for model in converted_models if isinstance(model, Path)][0]\n        )\n\n        compiler_op = ONNXApacheTVMCompiler()\n        compiler_op.to(device).execute(\n            model=model_path,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.APACHE_TVM_ONNX\n        ]()\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)\n\n        # 
Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(loaded_model, PytorchApacheTVMInferenceLearner)\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = optimized_model.get_inputs_example()\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        if dynamic:\n            inputs_example = [\n                input_[: len(input_) // 2] for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=1e-01\n                    )\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n\n\n@pytest.mark.parametrize(\n    (\n        \"output_library\",\n        \"dynamic\",\n        \"quantization_type\",\n        \"metric_drop_ths\",\n        \"metric\",\n    ),\n    [\n        (DeepLearningFramework.PYTORCH, True, None, None, None),\n        (DeepLearningFramework.PYTORCH, False, None, None, None),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.DYNAMIC,\n            2,\n            \"numeric_precision\",\n        ),\n        (\n            DeepLearningFramework.PYTORCH,\n            True,\n            QuantizationType.HALF,\n            2,\n            \"numeric_precision\",\n        ),\n        # (\n        #     DeepLearningFramework.PYTORCH,\n        #     True,\n        #     
QuantizationType.STATIC,\n        #     2,\n        #     \"numeric_precision\",\n        # ),\n    ],\n)\n@pytest.mark.skipif(\n    not tvm_is_available(), reason=\"Can't test tvm if it's not installed.\"\n)\ndef test_tvm_torch(\n    output_library: DeepLearningFramework,\n    dynamic: bool,\n    quantization_type: QuantizationType,\n    metric_drop_ths: int,\n    metric: str,\n):\n    with TemporaryDirectory() as tmp_dir:\n        (\n            model,\n            input_data,\n            model_params,\n            input_tfms,\n            model_outputs,\n            metric,\n        ) = initialize_model(dynamic, metric, output_library, device)\n        compiler_op = PyTorchApacheTVMCompiler()\n        compiler_op.to(device).execute(\n            model=model,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            metric_drop_ths=metric_drop_ths,\n            quantization_type=quantization_type,\n            input_data=input_data,\n        )\n\n        compiled_model = compiler_op.get_result()\n\n        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[\n            ModelCompiler.APACHE_TVM_TORCH\n        ]()\n        build_inference_learner_op.to(device).execute(\n            model=compiled_model,\n            model_orig=compiler_op.model_orig\n            if hasattr(compiler_op, \"model_orig\")\n            else None,\n            model_params=model_params,\n            input_tfms=input_tfms,\n            source_dl_framework=output_library,\n        )\n\n        optimized_model = build_inference_learner_op.get_result()\n        assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)\n\n        # Test save and load functions\n        optimized_model.save(tmp_dir)\n        loaded_model = PytorchApacheTVMInferenceLearner.load(tmp_dir)\n        assert isinstance(loaded_model, PytorchApacheTVMInferenceLearner)\n\n        assert isinstance(optimized_model.get_size(), int)\n\n        inputs_example = 
optimized_model.get_inputs_example()\n        res = optimized_model(*inputs_example)\n        assert res is not None\n\n        res_loaded = loaded_model(*inputs_example)\n        assert all(\n            [\n                torch.allclose(res_tensor, res_loaded_tensor)\n                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)\n            ]\n        )\n\n        # Test validity of the model\n        valid = check_model_validity(\n            optimized_model,\n            input_data,\n            model_outputs,\n            metric_drop_ths,\n            quantization_type,\n            metric,\n        )\n        assert valid\n\n        if dynamic:\n            inputs_example = [\n                input_[: len(input_) // 2] for input_ in inputs_example\n            ]\n            res = optimized_model(*inputs_example)\n            assert res is not None\n\n            res_orig = tuple(model(*inputs_example))\n            assert all(\n                [\n                    torch.allclose(\n                        res_tensor.float(), res_orig_tensor, rtol=1e-01\n                    )\n                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)\n                ]\n            )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/tests/utils.py",
    "content": "import os\nfrom pathlib import Path\nfrom typing import Any, Callable, Optional, Tuple\n\nimport tensorflow as tf\nimport tensorflow.keras as keras\nimport torch\nfrom tensorflow.keras import Model, layers\nfrom transformers import AlbertModel, AlbertTokenizer\n\nfrom nebullvm.config import TRAIN_TEST_SPLIT_RATIO, CONSTRAINED_METRIC_DROP_THS\nfrom nebullvm.core.models import (\n    DeepLearningFramework,\n    ModelParams,\n    DataType,\n    DeviceType,\n    Device,\n    QuantizationType,\n)\nfrom nebullvm.operations.conversions.huggingface import convert_hf_model\nfrom nebullvm.operations.conversions.pytorch import convert_torch_to_onnx\nfrom nebullvm.operations.measures.measures import (\n    LatencyOriginalModelMeasure,\n    MetricDropMeasure,\n)\nfrom nebullvm.operations.measures.utils import compute_relative_difference\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.transformations import MultiStageTransformation\nfrom nebullvm.tools.utils import gpu_is_available, extract_info_from_data\n\nINPUT_SHAPE = (3, 256, 256)\nOUTPUT_SHAPE = (2,)\nSTATIC_BATCH_SIZE = 1\nDYNAMIC_BATCH_SIZE = 2\n\n\nclass TestModel(torch.nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = torch.nn.Conv2d(\n            in_channels=3, out_channels=64, kernel_size=3\n        )\n        self.relu1 = torch.nn.ReLU()\n        self.conv2 = torch.nn.Conv2d(\n            in_channels=64, out_channels=32, kernel_size=3\n        )\n        self.relu2 = torch.nn.ReLU()\n        self.fcn = torch.nn.Linear(32, 2)\n\n    def forward(self, input_tensor_0, input_tensor_1):\n        x0 = self.relu2(self.conv2(self.relu1(self.conv1(input_tensor_0))))\n        x1 = self.relu2(self.conv2(self.relu1(self.conv1(input_tensor_1))))\n        x = x0 + x1\n        x = self.fcn(x.mean(dim=(-2, -1)).view(-1, 32))\n        return x\n\n\ndef tensorflow_model():\n    input_0 = keras.Input(shape=(256, 256, 3))\n    input_1 = keras.Input(shape=(256, 256, 
3))\n    x0 = layers.Conv2D(64, kernel_size=(3, 3), activation=\"relu\")(input_0)\n    x1 = layers.Conv2D(64, kernel_size=(3, 3), activation=\"relu\")(input_1)\n    x0 = layers.Conv2D(32, kernel_size=(3, 3), activation=\"relu\")(x0)\n    x1 = layers.Conv2D(32, kernel_size=(3, 3), activation=\"relu\")(x1)\n    x = x0 + x1\n    y = layers.Dense(2, activation=\"softmax\")(x)\n    return Model(inputs=[input_0, input_1], outputs=y)\n\n\ndef _build_static_model(\n    framework: DeepLearningFramework = DeepLearningFramework.PYTORCH,\n) -> Tuple[torch.nn.Module, ModelParams]:\n    model_params = {\n        \"batch_size\": STATIC_BATCH_SIZE,\n        \"input_infos\": [\n            {\"size\": (STATIC_BATCH_SIZE, *INPUT_SHAPE), \"dtype\": \"float32\"},\n            {\"size\": (STATIC_BATCH_SIZE, *INPUT_SHAPE), \"dtype\": \"float32\"},\n        ],\n        \"output_sizes\": [\n            (STATIC_BATCH_SIZE, *OUTPUT_SHAPE),\n        ],\n        \"output_types\": [DataType.FLOAT32],\n    }\n    model_params = ModelParams(**model_params)\n    if framework == DeepLearningFramework.PYTORCH:\n        model = TestModel()\n    elif framework == DeepLearningFramework.TENSORFLOW:\n        model = tensorflow_model()\n    else:\n        raise NotImplementedError\n    return model, model_params\n\n\ndef _build_dynamic_model(\n    framework: DeepLearningFramework,\n) -> Tuple[torch.nn.Module, ModelParams]:\n    model_params = {\n        \"batch_size\": DYNAMIC_BATCH_SIZE,\n        \"input_infos\": [\n            {\"size\": (DYNAMIC_BATCH_SIZE, *INPUT_SHAPE), \"dtype\": \"float32\"},\n            {\"size\": (DYNAMIC_BATCH_SIZE, *INPUT_SHAPE), \"dtype\": \"float32\"},\n        ],\n        \"output_sizes\": [\n            (DYNAMIC_BATCH_SIZE, *OUTPUT_SHAPE),\n        ],\n        \"output_types\": [DataType.FLOAT32],\n        \"dynamic_info\": {\n            \"inputs\": [\n                {\n                    0: {\n                        \"name\": \"batch\",\n                        
\"min_val\": 1,\n                        \"opt_val\": 1,\n                        \"max_val\": 2,\n                    }\n                },\n                {\n                    0: {\n                        \"name\": \"batch\",\n                        \"min_val\": 1,\n                        \"opt_val\": 1,\n                        \"max_val\": 2,\n                    }\n                },\n            ],\n            \"outputs\": [{0: \"batch\"}],\n        },\n    }\n    if framework == DeepLearningFramework.PYTORCH:\n        model = TestModel()\n    elif framework == DeepLearningFramework.TENSORFLOW:\n        model = tensorflow_model()\n    else:\n        raise NotImplementedError()\n    return model, ModelParams(**model_params)\n\n\ndef get_torch_model(dynamic: bool = False):\n    if dynamic:\n        model, model_params = _build_dynamic_model(\n            DeepLearningFramework.PYTORCH\n        )\n    else:\n        model, model_params = _build_static_model(\n            DeepLearningFramework.PYTORCH\n        )\n    return model, model_params\n\n\ndef get_tensorflow_model(dynamic: bool = False):\n    if dynamic:\n        model, model_params = _build_dynamic_model(\n            DeepLearningFramework.TENSORFLOW\n        )\n    else:\n        model, model_params = _build_static_model(\n            DeepLearningFramework.TENSORFLOW\n        )\n    return model, model_params\n\n\ndef get_huggingface_model(temp_dir: str, dl_framework: DeepLearningFramework):\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = AlbertModel.from_pretrained(\"albert-base-v1\")\n\n    text = \"Short text you wish to process\"\n    encoded_input = tokenizer(text, return_tensors=\"pt\")\n    device = (\n        Device(DeviceType.GPU)\n        if gpu_is_available()\n        else Device(DeviceType.CPU)\n    )\n\n    (\n        model,\n        input_data,\n        input_names,\n        output_structure,\n        output_type,\n    ) = convert_hf_model(model, 
[encoded_input], device=device)\n\n    input_data = DataManager(input_data)\n    input_data.split(TRAIN_TEST_SPLIT_RATIO)\n\n    # Benchmark original model\n    benchmark_orig_model_op = LatencyOriginalModelMeasure()\n    benchmark_orig_model_op.to(device).execute(\n        model=model,\n        input_data=input_data.get_split(\"test\"),\n        dl_framework=dl_framework,\n    )\n\n    model_outputs = benchmark_orig_model_op.get_result()[0]\n\n    model_path = os.path.join(temp_dir, \"test_model.onnx\")\n\n    model_params = extract_info_from_data(\n        model, input_data, dl_framework, None, device\n    )\n\n    device = DeviceType.GPU if gpu_is_available() else DeviceType.CPU\n    convert_torch_to_onnx(\n        model, input_data, model_params, Path(model_path), device\n    )\n\n    return (\n        model_path,\n        model_params,\n        output_structure,\n        input_names,\n        output_type,\n        input_data,\n        model_outputs,\n    )\n\n\ndef initialize_model(\n    dynamic: bool,\n    metric: Optional[str],\n    output_library: DeepLearningFramework,\n    device: Device,\n):\n    torch_device = torch.device(\n        \"cuda\" if device.type is DeviceType.GPU else \"cpu\"\n    )\n    batch_size = DYNAMIC_BATCH_SIZE if dynamic else STATIC_BATCH_SIZE\n\n    if output_library == DeepLearningFramework.PYTORCH:\n        model, model_params = get_torch_model(dynamic)\n\n        input_data = DataManager(\n            [\n                (\n                    (\n                        torch.randn(batch_size, *INPUT_SHAPE).to(torch_device),\n                        torch.randn(batch_size, *INPUT_SHAPE).to(torch_device),\n                    ),\n                    torch.zeros(batch_size, dtype=torch.long),\n                )\n            ]\n        )\n    elif output_library == DeepLearningFramework.TENSORFLOW:\n        model, model_params = get_tensorflow_model(dynamic)\n        input_data = DataManager(\n            [\n                (\n       
             (\n                        tf.random_normal_initializer()(\n                            shape=(\n                                batch_size,\n                                *INPUT_SHAPE[1:],\n                                INPUT_SHAPE[0],\n                            )\n                        ),\n                        tf.random_normal_initializer()(\n                            shape=(\n                                batch_size,\n                                *INPUT_SHAPE[1:],\n                                INPUT_SHAPE[0],\n                            )\n                        ),\n                    ),\n                    [0 for _ in range(batch_size)],\n                )\n            ]\n        )\n\n    input_data.split(TRAIN_TEST_SPLIT_RATIO)\n    input_tfms = MultiStageTransformation([])\n\n    # Benchmark original model\n    benchmark_orig_model_op = LatencyOriginalModelMeasure()\n    benchmark_res = benchmark_orig_model_op.to(device).execute(\n        model=model,\n        input_data=input_data.get_split(\"test\"),\n        dl_framework=output_library,\n    )\n\n    model_outputs = benchmark_res.model_outputs\n\n    if metric is not None:\n        metric = compute_relative_difference\n\n    return model, input_data, model_params, input_tfms, model_outputs, metric\n\n\ndef check_model_validity(\n    optimized_model: Any,\n    input_data: DataManager,\n    model_outputs: Any,\n    metric_drop_ths: float,\n    quantization_type: QuantizationType,\n    metric: Callable,\n) -> bool:\n    test_input_data, ys = input_data.get_split(\"test\").get_list(with_ys=True)\n    validity_check_op = MetricDropMeasure()\n    validity_check_op.execute(\n        optimized_model,\n        test_input_data,\n        model_outputs,\n        metric_drop_ths\n        if metric_drop_ths is not None\n        else CONSTRAINED_METRIC_DROP_THS,\n        metric_func=metric\n        if quantization_type is not None\n        else compute_relative_difference,\n        
ys=ys,\n    )\n\n    print(validity_check_op.get_result()[1])\n\n    return validity_check_op.get_result()[0]\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/operations/optimizations/utils.py",
    "content": "from typing import Callable, List\n\n\ndef map_compilers_and_compressors(ignore_list: List, enum_class: Callable):\n    if ignore_list is None:\n        ignore_list = []\n    else:\n        ignore_list = [enum_class(element) for element in ignore_list]\n    return ignore_list\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/blade_disc.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import torch_blade\nexcept ImportError:\n    torch_blade = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/deepsparse.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    from deepsparse import compile_model, cpu\nexcept ImportError:\n    compile_model = cpu = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/diffusers.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import diffusers  # noqa F401\n    from diffusers import (\n        StableDiffusionPipeline,\n        DiffusionPipeline,\n    )  # noqa F401\n    from diffusers.models import (\n        AutoencoderKL,\n        UNet2DConditionModel,\n    )  # noqa F401\n    from diffusers.models.unet_2d import UNet2DOutput  # noqa F401\nexcept ImportError:\n    diffusers = DummyClass\n    StableDiffusionPipeline = DummyClass\n    DiffusionPipeline = DummyClass\n    UNet2DConditionModel = DummyClass\n    AutoencoderKL = DummyClass\n    UNet2DOutput = DummyClass\n\ntry:\n    import onnx_graphsurgeon  # noqa F401\nexcept ImportError:\n    onnx_graphsurgeon = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/dummy.py",
    "content": "class DummyClass:\n    pass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/huggingface.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    from transformers import PreTrainedModel, CLIPTextModel, CLIPTokenizer\n    from transformers.tokenization_utils import PreTrainedTokenizer\n    from transformers.models.bert.modeling_bert import (\n        BertModel,\n        BertEmbeddings,\n        BertEncoder,\n        BertPooler,\n        BertPreTrainedModel,\n    )\n    from transformers import BertConfig, GPT2Tokenizer, GPT2LMHeadModel\nexcept ImportError:\n    # add placeholders for function definition\n    PreTrainedModel = DummyClass\n    CLIPTextModel = DummyClass\n    CLIPTokenizer = DummyClass\n    PreTrainedTokenizer = DummyClass\n    BertModel = DummyClass\n    BertEmbeddings = DummyClass\n    BertEncoder = DummyClass\n    BertPooler = DummyClass\n    BertPreTrainedModel = DummyClass\n    BertConfig = DummyClass\n    GPT2Tokenizer = DummyClass\n    GPT2LMHeadModel = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/neural_compressor.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import neural_compressor  # noqa F401\n    from neural_compressor.adaptor.pytorch import (\n        _cfg_to_qconfig as cfg_to_qconfig,\n        _cfgs_to_fx_cfgs as cfgs_to_fx_cfgs,\n    )\n    from neural_compressor.experimental import (\n        MixedPrecision,\n        Quantization,\n        Pruning,\n    )\nexcept ImportError:\n    cfg_to_qconfig = cfgs_to_fx_cfgs = None\n    MixedPrecision = Quantization = Pruning = DummyClass\nexcept ValueError:\n    # MacOS\n    cfg_to_qconfig = cfgs_to_fx_cfgs = None\n    MixedPrecision = Quantization = Pruning = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/onnx.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import onnx  # noqa F401\nexcept ImportError:\n    onnx = DummyClass\n\ntry:\n    import onnxmltools  # noqa F401\n    from onnxmltools.utils.float16_converter import (  # noqa F401\n        convert_float_to_float16_model_path,\n    )\n\nexcept ImportError:\n    convert_float_to_float16_model_path = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/onnxruntime.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import onnxruntime  # noqa F401\n    from onnxruntime.quantization import (\n        QuantType,\n        quantize_static,\n        quantize_dynamic,\n        CalibrationDataReader,\n    )\nexcept ImportError:\n    onnxruntime = DummyClass\n    setattr(onnxruntime, \"SessionOptions\", None)\n    QuantType = quantize_static = quantize_dynamic = None\n    CalibrationDataReader = DummyClass\nexcept FileNotFoundError:\n    # Solves a colab issue\n    QuantType = quantize_static = quantize_dynamic = None\n    CalibrationDataReader = DummyClass\n\ntry:\n    # They require torch\n    from onnxruntime.transformers import optimizer\n    from onnxruntime.transformers.optimizer import MODEL_TYPES\nexcept ImportError:\n    MODEL_TYPES = DummyClass\n    optimizer = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/onnxsim.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import onnxsim\nexcept ImportError:\n    onnxsim = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/openvino.py",
    "content": "import logging\n\nfrom nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    from openvino.runtime import Core, Model, CompiledModel, InferRequest\n    from openvino.tools.pot import DataLoader\n    from openvino.tools.pot import IEEngine\n    from openvino.tools.pot import load_model, save_model\n    from openvino.tools.pot import compress_model_weights\n    from openvino.tools.pot import create_pipeline\nexcept ImportError:\n    Model = CompiledModel = InferRequest = Core = DummyClass\n    DataLoader = IEEngine = DummyClass\n    load_model = save_model = compress_model_weights = create_pipeline = None\n\n# Fix openvino issue with logging\n# It adds a second handler to the root logger that cause issues\nif len(logging.getLogger().handlers) > 1:\n    logging.getLogger().removeHandler(logging.getLogger().handlers[-1])\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/tensor_rt.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import tensorrt\n    from tensorrt import IInt8EntropyCalibrator2\nexcept ImportError:\n    tensorrt = DummyClass\n    IInt8EntropyCalibrator2 = DummyClass\n\ntry:\n    import polygraphy.cuda as polygraphy\n    from polygraphy.logger import G_LOGGER\n\n    G_LOGGER.module_severity = 40\n    from polygraphy.backend.onnx.loader import fold_constants\nexcept ImportError:\n    polygraphy = DummyClass\n    fold_constants = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/tensorflow.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import absl.logging\n\n    absl.logging.set_verbosity(absl.logging.ERROR)\nexcept Exception:\n    pass\n\n\nclass Keras:\n    Model = DummyClass\n\n\nclass data:\n    Dataset = DummyClass\n\n\nclass dtypes:\n    DType = DummyClass\n\n\nclass Tensorflow:\n    Module = DummyClass\n    Tensor = DummyClass\n    keras = Keras()\n    data = data\n    dtypes = dtypes\n    float16 = float32 = int32 = int64 = DummyClass\n\n    @staticmethod\n    def function(**kwargs):\n        return lambda x: x\n\n\ntry:\n    import tensorflow  # noqa F401\n\n    physical_devices = tensorflow.config.experimental.list_physical_devices(\n        \"GPU\"\n    )\n    if len(physical_devices) > 0:\n        for physical_device in physical_devices:\n            tensorflow.config.experimental.set_memory_growth(\n                physical_device, True\n            )\n\n    tensorflow.get_logger().setLevel(\"ERROR\")\n    tensorflow.autograph.set_verbosity(0)\nexcept (ImportError, AttributeError):\n    tensorflow = Tensorflow\n\n\ntry:\n    import tf2onnx  # noqa F401\n\n    tf2onnx.logging.set_level(\"ERROR\")\n    tf2onnx.logging.set_tf_verbosity(\"ERROR\")\nexcept ImportError:\n    tf2onnx = object\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/torch.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import torch  # noqa F401\n    from torch.nn import Module  # noqa F401\n    from torch.jit import ScriptModule  # noqa F401\n    from torch.fx import GraphModule\n    from torch.utils.data import DataLoader, Dataset  # noqa F401\n    from torch.quantization.quantize_fx import (  # noqa F401\n        prepare_fx,\n        convert_fx,\n    )\n\n    from torch.ao.quantization.stubs import QuantStub, DeQuantStub\n    from torch.fx import symbolic_trace\n    from torch.quantization import default_dynamic_qconfig\n    import torch.distributed as torch_distributed\nexcept ImportError:\n\n    class nn:\n        Module = DummyClass\n\n    class jit:\n        ScriptModule = DummyClass\n\n    class fx:\n        GraphModule = DummyClass\n\n    class torch:\n        float = half = int8 = DummyClass\n        float16 = float32 = int32 = int64 = DummyClass\n        Tensor = DummyClass\n        dtype = DummyClass\n        nn = nn\n        jit = jit\n        Generator = DummyClass\n        FloatTensor = DummyClass\n        fx = fx\n\n        @staticmethod\n        def no_grad():\n            return lambda x: None\n\n        @staticmethod\n        def inference_mode():\n            return lambda x: None\n\n    Dataset = DummyClass\n    Module = DummyClass\n    ScriptModule = DummyClass\n    GraphModule = DummyClass\n    DataLoader = DummyClass\n    symbolic_trace = None\n    QuantStub = DeQuantStub = DummyClass\n    default_dynamic_qconfig = prepare_fx = convert_fx = None\n    Generator = DummyClass\n    FloatTensor = DummyClass\n    torch_distributed = None\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/torch_neuron.py",
    "content": "import logging\n\nfrom nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import torch_neuron  # noqa F401\n\n    logging.getLogger(\"Neuron\").setLevel(logging.WARNING)\nexcept ImportError:\n    try:\n        import torch_neuronx  # noqa F401\n\n        logging.getLogger(\"Neuron\").setLevel(logging.WARNING)\n    except ImportError:\n        torch_neuron = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/torch_tensorrt.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import torch_tensorrt\n    from torch_tensorrt.ptq import DataLoaderCalibrator  # noqa F401\nexcept ImportError:\n    torch_tensorrt = DummyClass\n    DataLoaderCalibrator = None\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/torch_xla.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import torch_xla\n    import torch_xla.core.xla_model as xm\nexcept ImportError:\n    torch_xla = DummyClass\n    xm = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/tvm.py",
    "content": "from nebullvm.optional_modules.dummy import DummyClass\n\ntry:\n    import tvm\n    from tvm import IRModule\n    from tvm.runtime.ndarray import NDArray\n    from tvm.autotvm.tuner import XGBTuner\n    from tvm import autotvm\n    import tvm.relay as relay\n    from tvm.relay.transform import ToMixedPrecision\n    from tvm.contrib.graph_executor import GraphModule\n    from tvm.runtime import Module\n    from tvm.relay.backend.executor_factory import ExecutorFactoryModule\nexcept ImportError:\n    tvm = (\n        IRModule\n    ) = (\n        NDArray\n    ) = (\n        XGBTuner\n    ) = (\n        ExecutorFactoryModule\n    ) = autotvm = relay = ToMixedPrecision = GraphModule = Module = DummyClass\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/optional_modules/utils.py",
    "content": "import cpuinfo\nfrom loguru import logger\n\nfrom nebullvm.core.models import Device, DeviceType\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    bladedisc_is_available,\n    deepsparse_is_available,\n    faster_transformer_is_available,\n    intel_neural_compressor_is_available,\n    onnxruntime_is_available,\n    openvino_is_available,\n    tensorrt_is_available,\n    torch_tensorrt_is_available,\n    torch_neuron_is_available,\n    torch_xla_is_available,\n    tvm_is_available,\n)\nfrom nebullvm.tools.utils import gpu_is_available, check_module_version\n\n\ndef torch_is_available() -> bool:\n    try:\n        import torch  # noqa F401\n\n        if not torch.cuda.is_available() and gpu_is_available():\n            logger.warning(\n                \"Installed PyTorch does not have cuda support. \"\n                \"Please ensure that torch.cuda.is_available() \"\n                \"returns True by installing the proper version \"\n                \"of PyTorch. \"\n            )\n\n        if not check_module_version(torch, min_version=\"1.10.0\"):\n            logger.warning(\n                \"torch module version must be >= 1.10.0. \"\n                \"Please update it if you want to use it.\"\n            )\n            return False\n    except ImportError:\n        return False\n    else:\n        return True\n\n\ndef tensorflow_is_available() -> bool:\n    try:\n        import tensorflow  # noqa F401\n\n        if not check_module_version(tensorflow, min_version=\"2.7.0\"):\n            logger.warning(\n                \"tensorflow module version must be >= 2.7.0. 
\"\n                \"Please update it if you want to use it.\"\n            )\n            return False\n    except ImportError:\n        return False\n    else:\n        return True\n\n\ndef onnx_is_available() -> bool:\n    try:\n        import onnx  # noqa F401\n\n        if not check_module_version(onnx, min_version=\"1.10.0\"):\n            logger.warning(\n                \"onnx module version must be >= 1.10.0. \"\n                \"Please update it if you want to use it.\"\n            )\n            return False\n        return True\n    except ImportError:\n        return False\n\n\ndef _onnxmltools_is_available():\n    try:\n        import onnxmltools  # noqa F401\n\n        if not check_module_version(onnxmltools, min_version=\"1.11.0\"):\n            logger.warning(\n                \"onnxmltools module version must be >= 1.11.0. \"\n                \"Please update it if you want to use the ONNX API \"\n                \"or the ONNX pipeline for PyTorch and Tensorflow.\"\n            )\n            return False\n        else:\n            return True\n    except ImportError:\n        return False\n\n\ndef _onnxsim_is_available():\n    try:\n        import onnxsim  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef _polygraphy_is_available():\n    try:\n        import polygraphy.cuda  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef tf2onnx_is_available():\n    try:\n        import tf2onnx  # noqa F401\n\n        return True\n    except ImportError:\n        return False\n\n\ndef check_dependencies(device: Device):\n    missing_frameworks = []\n    missing_suggested_compilers = []\n    missing_optional_compilers = []\n    missing_dependencies = []\n\n    processor = cpuinfo.get_cpu_info()[\"brand_raw\"].lower()\n\n    if device.type is DeviceType.TPU:\n        if not torch_is_available():\n            missing_frameworks.append(\"torch\")\n        if not 
torch_xla_is_available():\n            missing_dependencies.append(\"torch_xla\")\n    elif device.type is DeviceType.NEURON:\n        if not torch_is_available():\n            missing_frameworks.append(\"torch\")\n        if not torch_neuron_is_available():\n            missing_dependencies.append(\"torch_neuron\")\n    else:\n        if not onnx_is_available():\n            missing_frameworks.append(\"onnx\")\n\n        if not tvm_is_available():\n            missing_optional_compilers.append(\"tvm\")\n        if not onnxruntime_is_available():\n            missing_suggested_compilers.append(\"onnxruntime\")\n        elif not _onnxmltools_is_available():\n            missing_dependencies.append(\"onnxmltools\")\n        if not faster_transformer_is_available():\n            missing_optional_compilers.append(\"faster_transformer\")\n        if device.type is DeviceType.GPU:\n            if not tensorrt_is_available():\n                missing_suggested_compilers.append(\"tensorrt\")\n            else:\n                if not _onnxsim_is_available():\n                    missing_dependencies.append(\"onnxsim\")\n                elif not _polygraphy_is_available():\n                    missing_dependencies.append(\"polygraphy\")\n        if device.type is DeviceType.CPU:\n            if not openvino_is_available() and \"intel\" in processor:\n                missing_suggested_compilers.append(\"openvino\")\n\n        if torch_is_available():\n            if not tvm_is_available():\n                if \"tvm\" not in missing_optional_compilers:\n                    missing_optional_compilers.append(\"tvm\")\n            if not bladedisc_is_available():\n                missing_optional_compilers.append(\"torch_blade\")\n\n            if device.type is DeviceType.CPU:\n                if not deepsparse_is_available() and \"intel\" in processor:\n                    missing_suggested_compilers.append(\"deepsparse\")\n                if (\n                    not 
intel_neural_compressor_is_available()\n                    and \"intel\" in processor\n                ):\n                    missing_suggested_compilers.append(\"neural_compressor\")\n            elif device.type is DeviceType.GPU:\n                if not torch_tensorrt_is_available:\n                    missing_suggested_compilers.append(\"torch_tensorrt\")\n        else:\n            missing_frameworks.append(\"torch\")\n\n        if tensorflow_is_available():\n            if not tf2onnx_is_available():\n                missing_dependencies.append(\"tf2onnx\")\n        else:\n            missing_frameworks.append(\"tensorflow\")\n\n    missing_frameworks = \", \".join(missing_frameworks)\n    if len(missing_frameworks) > 0:\n        logger.warning(\n            f\"Missing Frameworks: {missing_frameworks}.\\n \"\n            f\"Please install them \"\n            \"to include them in the optimization pipeline.\"\n        )\n\n    missing_suggested_compilers = \", \".join(missing_suggested_compilers)\n    if len(missing_suggested_compilers) > 0:\n        logger.warning(\n            f\"Missing Compilers: {missing_suggested_compilers}.\\n \"\n            f\"Please install them \"\n            \"to include them in the optimization pipeline.\"\n        )\n\n    missing_dependencies = \", \".join(missing_dependencies)\n    if len(missing_dependencies) > 0:\n        logger.warning(\n            f\"Missing Dependencies: {missing_dependencies}.\\n \"\n            f\"Without them, some compilers \"\n            f\"may not work properly.\"\n        )\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/adapters.py",
    "content": "import abc\nimport copy\nfrom abc import abstractmethod\nimport time\nfrom typing import List, Any, Union\n\nfrom loguru import logger\n\nfrom nebullvm.core.models import (\n    Device,\n    DeviceType,\n    OptimizedModel,\n    OriginalModel,\n)\nfrom nebullvm.operations.conversions.huggingface import convert_hf_model\nfrom nebullvm.operations.inference_learners.base import (\n    BaseInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.huggingface import (\n    DiffusionInferenceLearner,\n)\nfrom nebullvm.optional_modules.diffusers import StableDiffusionPipeline\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.diffusers import (\n    get_unet_inputs,\n    preprocess_diffusers,\n    postprocess_diffusers,\n)\nfrom nebullvm.tools.pytorch import get_torch_model_size\nfrom nebullvm.tools.utils import (\n    is_huggingface_data,\n    check_module_version,\n    get_throughput,\n)\n\n\nclass ModelAdapter(abc.ABC):\n    @property\n    @abstractmethod\n    def adapted_model(self):\n        pass\n\n    @property\n    @abstractmethod\n    def adapted_data(self):\n        pass\n\n    @abstractmethod\n    def adapt_inference_learner(\n        self, optimized_model: OptimizedModel\n    ) -> BaseInferenceLearner:\n        pass\n\n    @abstractmethod\n    def adapt_original_model(\n        self, original_model: OriginalModel\n    ) -> OriginalModel:\n        pass\n\n\nclass DiffusionAdapter(ModelAdapter):\n    def __init__(\n        self,\n        original_pipeline: StableDiffusionPipeline,\n        data: List,\n        device: Device,\n    ):\n        self.original_pipeline = copy.deepcopy(original_pipeline)\n        self.original_data = data\n        self.device = device\n        self.__adapted = False\n        self.__df_model = None\n        self.__df_data = None\n\n    @torch.no_grad()\n    def __benchmark_pipeline(\n        self,\n        pipe: Union[StableDiffusionPipeline, BaseInferenceLearner],\n        
num_warmup_steps=2,\n        num_steps=3,\n    ):\n\n        # Warmup\n        for i in range(num_warmup_steps):\n            _ = pipe(self.original_data[i % len(self.original_data)]).images[0]\n\n        start = time.time()\n        # Benchmark\n        for i in range(num_steps):\n            _ = pipe(self.original_data[i % len(self.original_data)]).images[0]\n\n        took = time.time() - start\n\n        return took / num_steps\n\n    def __adapt(self):\n        if not check_module_version(torch, max_version=\"1.13.1+cu117\"):\n            raise ValueError(\n                \"Diffusion models are only supported in PyTorch \"\n                \"versions <= 1.13.1. Please downgrade your PyTorch \"\n                \"version and try again.\"\n            )\n\n        model = copy.deepcopy(self.original_pipeline)\n        model.get_unet_inputs = get_unet_inputs\n        model.to(self.device.to_torch_format())\n        self.__df_data = [\n            (\n                tuple(\n                    d.reshape((1,)) if d.shape == torch.Size([]) else d\n                    for d in model.get_unet_inputs(\n                        model,\n                        prompt=prompt,\n                    )\n                    if d is not None\n                ),\n                None,\n            )\n            for prompt in self.original_data\n        ]\n        self.__df_model = preprocess_diffusers(model)\n        self.__adapted = True\n\n    @property\n    def adapted_model(self):\n        if self.__adapted is False:\n            self.__adapt()\n        return self.__df_model\n\n    @property\n    def adapted_data(self):\n        if self.__adapted is False:\n            self.__adapt()\n        return self.__df_data\n\n    def adapt_inference_learner(\n        self, optimized_model: OptimizedModel\n    ) -> OptimizedModel:\n        pipe = copy.deepcopy(self.original_pipeline)\n        pipe.to(self.device.to_torch_format())\n        if self.device.type is DeviceType.GPU:\n    
        try:\n                pipe.enable_xformers_memory_efficient_attention()\n            except Exception:\n                pass\n\n        pipe = postprocess_diffusers(\n            optimized_model.inference_learner,\n            pipe,\n            self.device,\n        )\n        logger.info(\"Benchmarking optimized pipeline...\")\n        optimized_model.latency_seconds = self.__benchmark_pipeline(pipe)\n        optimized_model.throughput = get_throughput(\n            optimized_model.latency_seconds\n        )\n        optimized_model.inference_learner = DiffusionInferenceLearner(pipe)\n        optimized_model.size_mb += (\n            sum(\n                [\n                    get_torch_model_size(v)\n                    for (k, v) in pipe.__dict__.items()\n                    if isinstance(v, torch.nn.Module) and k != \"unet\"\n                ]\n            )\n            / 1e6\n        )\n        return optimized_model\n\n    def adapt_original_model(\n        self, original_model: OriginalModel\n    ) -> OriginalModel:\n        pipe = copy.deepcopy(self.original_pipeline)\n        pipe.to(self.device.to_torch_format())\n        logger.info(\"Benchmarking original pipeline...\")\n        original_model.latency_seconds = self.__benchmark_pipeline(pipe)\n        original_model.throughput = get_throughput(\n            original_model.latency_seconds\n        )\n        original_model.size_mb += (\n            sum(\n                [\n                    get_torch_model_size(v)\n                    for (k, v) in pipe.__dict__.items()\n                    if isinstance(v, torch.nn.Module) and k != \"unet\"\n                ]\n            )\n            / 1e6\n        )\n        return original_model\n\n\nclass HuggingFaceAdapter(ModelAdapter):\n    def __init__(self, model: Any, data: List, device: Device, **kwargs):\n        self.original_model = model\n        self.original_data = data\n        self.device = device\n        self.tokenizer_params = 
kwargs\n        self.__adapted = False\n        self.__hf_model = None\n        self.__hf_data = None\n        self.__hf_input_names = None\n        self.__hf_output_type = None\n        self.__hf_output_structure = None\n\n    def __adapt_model(self):\n        if not is_huggingface_data(self.original_data[0]):\n            raise ValueError(\"Cannot convert non-HuggingFace data\")\n        (\n            model,\n            data,\n            input_names,\n            output_structure,\n            output_type,\n        ) = convert_hf_model(\n            self.original_model,\n            self.original_data,\n            self.device,\n            **self.tokenizer_params,\n        )\n        self.__hf_model = model\n        self.__hf_data = data\n        self.__hf_input_names = input_names\n        self.__hf_output_type = output_type\n        self.__hf_output_structure = output_structure\n        self.__adapted = True\n\n    @property\n    def adapted_model(self):\n        if self.__adapted is False:\n            self.__adapt_model()\n        return self.__hf_model\n\n    @property\n    def adapted_data(self):\n        if self.__adapted is False:\n            self.__adapt_model()\n        return self.__hf_data\n\n    def adapt_inference_learner(\n        self, optimized_model: OptimizedModel\n    ) -> OptimizedModel:\n        from nebullvm.operations.inference_learners.huggingface import (\n            HuggingFaceInferenceLearner,\n        )\n\n        optimized_model.inference_learner = HuggingFaceInferenceLearner(\n            core_inference_learner=optimized_model.inference_learner,\n            output_structure=self.__hf_output_structure,\n            input_names=self.__hf_input_names,\n            output_type=self.__hf_output_type,\n        )\n\n        return optimized_model\n\n    def adapt_original_model(\n        self, original_model: OriginalModel\n    ) -> OriginalModel:\n        return original_model\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/benchmark.py",
    "content": "import time\nfrom abc import abstractmethod, ABC\nfrom typing import Any, Dict, Type\n\nimport numpy as np\nfrom loguru import logger\nfrom tqdm import tqdm\n\nfrom nebullvm.core.models import DeepLearningFramework, ModelParams, DeviceType\nfrom nebullvm.operations.inference_learners.base import BaseInferenceLearner\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch, DataLoader\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.onnx import create_model_inputs_onnx\nfrom nebullvm.tools.pytorch import create_model_inputs_torch\nfrom nebullvm.tools.tf import create_model_inputs_tf\nfrom nebullvm.tools.utils import (\n    check_input_data,\n    extract_info_from_data,\n    is_data_subscriptable,\n    check_device,\n)\n\n\ndef _get_dl_framework(model: Any):\n    if (\n        isinstance(model, torch.nn.Module)\n        or str(model).startswith(\"Pytorch\")\n        or str(model).startswith(\"Torch\")\n    ):\n        return DeepLearningFramework.PYTORCH\n    elif (isinstance(model, tf.Module) and model is not None) or str(\n        model\n    ).startswith(\"Tensorflow\"):\n        return DeepLearningFramework.TENSORFLOW\n    elif isinstance(model, str) or str(model).startswith(\"Numpy\"):\n        return DeepLearningFramework.NUMPY\n    else:\n        raise TypeError(f\"Model type {type(model)} not supported.\")\n\n\ndef _create_model_inputs(\n    dl_framework: DeepLearningFramework, model_params: ModelParams\n):\n    if dl_framework == DeepLearningFramework.PYTORCH:\n        input_data = create_model_inputs_torch(model_params.input_infos)\n    elif dl_framework == DeepLearningFramework.TENSORFLOW:\n        input_data = create_model_inputs_tf(model_params.input_infos)\n    elif dl_framework == DeepLearningFramework.NUMPY:\n        input_data = create_model_inputs_onnx(model_params.input_infos)\n    else:\n        raise TypeError(f\"Unknown framework {dl_framework}\")\n\n    
return input_data\n\n\nclass BaseBenchmark(ABC):\n    def __init__(self, model, input_tensors, device, n_warmup=50, n_runs=1000):\n        self.model = model\n        self.input_tensors = input_tensors\n        self.device = device\n        self.n_warmup = n_warmup\n        self.n_runs = n_runs\n\n    @abstractmethod\n    def benchmark(self):\n        raise NotImplementedError\n\n\nclass PytorchBenchmark(BaseBenchmark):\n    def benchmark(self):\n        input_tensors = [\n            [tensor.to(self.device.to_torch_format()) for tensor in tensors]\n            for tensors in self.input_tensors\n        ]\n        batch_size = input_tensors[0][0].shape[0]\n\n        if isinstance(self.model, torch.nn.Module):\n            self.model.to(self.device.to_torch_format()).eval()\n\n        with torch.no_grad():\n            for i in tqdm(\n                range(self.n_warmup),\n                desc=f\"Performing warm up on {self.n_warmup} iterations\",\n            ):\n                self.model(\n                    *input_tensors[i % min(self.n_warmup, len(input_tensors))]\n                )\n        if self.device.type is DeviceType.GPU:\n            torch.cuda.synchronize()\n        timings = []\n        with torch.no_grad():\n            for i in tqdm(\n                range(1, self.n_runs + 1),\n                desc=f\"Performing benchmark on {self.n_runs} iterations\",\n            ):\n                start_time = time.time()\n                self.model(\n                    *input_tensors[i % min(self.n_runs, len(input_tensors))]\n                )\n                if self.device.type is DeviceType.GPU:\n                    torch.cuda.synchronize()\n                end_time = time.time()\n                timings.append(end_time - start_time)\n\n        print(f\"Batch size: {batch_size}\")\n\n        throughput = batch_size / np.mean(timings)\n        latency = np.mean(timings) / batch_size\n\n        print(\"Average Throughput: %.2f data/second\" % throughput)\n  
      print(\"Average Latency: %.4f seconds/data\" % latency)\n\n        return throughput, latency\n\n\nclass TensorflowBenchmark(BaseBenchmark):\n    def benchmark(self):\n        batch_size = self.input_tensors[0][0].shape[0]\n\n        for i in tqdm(\n            range(self.n_warmup),\n            desc=f\"Performing warm up on {self.n_warmup} iterations\",\n        ):\n            with tf.device(self.device.to_tf_format()):\n                self.model(\n                    *self.input_tensors[\n                        i % min(self.n_warmup, len(self.input_tensors))\n                    ]\n                )\n\n        timings = []\n        for i in tqdm(\n            range(1, self.n_runs + 1),\n            desc=f\"Performing benchmark on {self.n_runs} iterations\",\n        ):\n            start_time = time.time()\n            with tf.device(self.device.to_tf_format()):\n                self.model(\n                    *self.input_tensors[\n                        i % min(self.n_runs, len(self.input_tensors))\n                    ]\n                )\n\n            end_time = time.time()\n            timings.append(end_time - start_time)\n\n        print(f\"Batch size: {batch_size}\")\n\n        throughput = batch_size / np.mean(timings)\n        latency = np.mean(timings) / batch_size\n\n        print(\"Average Throughput: %.2f data/second\" % throughput)\n        print(\"Average Latency: %.4f seconds/data\" % latency)\n\n        return throughput, latency\n\n\nclass NumpyBenchmark(BaseBenchmark):\n    def benchmark(self):\n        if not isinstance(self.model, BaseInferenceLearner):\n            # TODO: Add support for original onnx models\n            raise NotImplementedError(\n                \"Benchmark function doesn't support original \" \"onnx models.\"\n            )\n        batch_size = self.input_tensors[0][0].shape[0]\n\n        for i in tqdm(\n            range(self.n_warmup),\n            desc=f\"Performing warm up on {self.n_warmup} 
iterations\",\n        ):\n            self.model(\n                *self.input_tensors[\n                    i % min(self.n_warmup, len(self.input_tensors))\n                ]\n            )\n\n        timings = []\n        for i in tqdm(\n            range(1, self.n_runs + 1),\n            desc=f\"Performing benchmark on {self.n_runs} iterations\",\n        ):\n            start_time = time.time()\n            self.model(\n                *self.input_tensors[\n                    i % min(self.n_runs, len(self.input_tensors))\n                ]\n            )\n\n            end_time = time.time()\n            timings.append(end_time - start_time)\n\n        print(f\"Batch size: {batch_size}\")\n\n        throughput = batch_size / np.mean(timings)\n        latency = np.mean(timings) / batch_size\n\n        print(\"Average Throughput: %.2f data/second\" % throughput)\n        print(\"Average Latency: %.4f seconds/data\" % latency)\n\n        return throughput, latency\n\n\ndef benchmark(\n    model, input_data, device=None, random=False, n_warmup=50, n_runs=1000\n):\n    \"\"\"Performs a Benchmark on the input model regardless of the framework it\n    was used for implementing it.\n    Args:\n        model (Any): The input model.\n        input_data (Iterable or Sequence): Input data to be used for\n            optimizing the model. PyTorch, TensorFlow\n            and Onnx respectively accept input tensor in `torch.Tensor`,\n            `tf.Tensor` and `np.ndarray` formats. Note that the each input\n            sample must be a tuple containing a tuple as first element, the\n            `inputs`, and the `label` as second element. The `inputs` needs to\n            be passed as tuple even if a single input is needed by the model\n            (in this case the `inputs` tuple will contain just an element).\n            HuggingFace models can take as data samples both dictionaries or\n            strings. 
Strings will then be converted in data samples using the\n            HuggingFace tokenizer which must be given as input when just a\n            list of string is provided as input_data (tokenizers can be passed\n            as extra arguments of this function using the keyword `tokenizer`).\n        device (str): Device to be used for running the benchmark. If None,\n            CPU will be used. Default: None.\n        random (bool, optional): If set to true, the data used to benchmark the\n            model will be computed randomly given the info extracted from the\n            provided input_data.\n        n_warmup (int, optional): Number of warmup iterations.\n        n_runs (int, optional): Number of iterations performed to benchmark\n            the model.\n    \"\"\"\n    if not isinstance(model, BaseInferenceLearner):\n        device = check_device(device)\n    else:\n        device = model.device\n\n    logger.info(f\"Running benchmark on {device.type.name}\")\n\n    dl_framework = _get_dl_framework(model)\n\n    if isinstance(input_data, (DataLoader, tf.data.Dataset)):\n        try:\n            input_data = DataManager.from_dataloader(input_data)\n        except Exception:\n            raise ValueError(\n                \"The provided dataloader does not match the expected \"\n                \"format.\\n\"\n                \"Speedster supports dataloaders that return tuples in \"\n                \"the\\n\"\n                \"following formats: \\n\"\n                \"Single input: (input,  label)\\n\"\n                \"Multiple inputs: ((input1, input2, ...),  label) or \"\n                \"(input1, input2, ...,  label)\\n\"\n                \"Inputs and labels should be either tensors or numpy \"\n                \"arrays,\\n\"\n                \"depending on the framework used.\\n\"\n            )\n\n    if not isinstance(input_data, DataManager):\n        if check_input_data(input_data):\n            if is_data_subscriptable(input_data):\n     
           input_data = DataManager(input_data)\n            else:\n                input_data = DataManager.from_iterable(input_data)\n        else:\n            raise ValueError(\n                \"The provided data does not match the expected \"\n                \"format.\\n\"\n                \"Speedster supports data in the following formats: \\n\"\n                \"- PyTorch DataLoader\\n\"\n                \"- TensorFlow Dataset\\n\"\n                \"- List of tuples: [((input_0, ... ), label), ...] \\n\"\n                \"Inputs and labels should be either tensors or numpy \"\n                \"arrays,\\n\"\n                \"depending on the framework used.\\n\"\n            )\n\n    if random:\n        model_params = extract_info_from_data(\n            model, input_data, dl_framework, None, device\n        )\n        input_data = _create_model_inputs(dl_framework, model_params)\n    else:\n        input_data = input_data.get_list()\n\n    BENCHMARK_FUNCTIONS[dl_framework](\n        model=model,\n        input_tensors=input_data,\n        device=device,\n        n_warmup=n_warmup,\n        n_runs=n_runs,\n    ).benchmark()\n\n\nBENCHMARK_FUNCTIONS: Dict[DeepLearningFramework, Type[BaseBenchmark]] = {\n    DeepLearningFramework.PYTORCH: PytorchBenchmark,\n    DeepLearningFramework.TENSORFLOW: TensorflowBenchmark,\n    DeepLearningFramework.NUMPY: NumpyBenchmark,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/data.py",
    "content": "from typing import Sequence, List, Tuple, Any, Union, Iterable\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.config import MIN_DIM_INPUT_DATA\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch, Dataset, DataLoader\nfrom nebullvm.tools.onnx import convert_to_numpy\n\n\nclass DataManager:\n    \"\"\"Class for managing the user data in nebullvm.\n\n    Attributes:\n        data_reader(Sequence): Object implementing the __getitem__, the\n            __len__ and the __iter__/__next__ APIs. It should read the\n            user data and return tuples of tensors for feeding the models.\n    \"\"\"\n\n    def __init__(self, data_reader: Sequence):\n        self._data_reader = data_reader\n        self._pointer = 0\n        self.train_idxs = []\n        self.test_idxs = []\n\n    def __getitem__(self, item):\n        return self._data_reader[item]\n\n    def __len__(self):\n        return len(self._data_reader)\n\n    def __iter__(self):\n        self._pointer = 0\n        return self\n\n    def __next__(self):\n        if self._pointer < len(self):\n            data = self[self._pointer]\n            self._pointer += 1\n            return data\n        else:\n            raise StopIteration\n\n    def get_numpy_list(\n        self, n: int = None, shuffle: bool = False, with_ys: bool = False\n    ) -> Union[\n        List[Tuple[np.ndarray, ...]], Tuple[List[Tuple[np.ndarray, ...]], List]\n    ]:\n        if n is None:\n            n = len(self)\n        if not with_ys:\n            return [\n                tuple(convert_to_numpy(x) for x in tuple_)\n                for tuple_ in self.get_list(n, shuffle)\n            ]\n        else:\n            xs, ys = self.get_list(n, shuffle, with_ys=True)\n            return [\n                tuple(convert_to_numpy(x) for x in tuple_) for tuple_ in xs\n            ], ys\n\n    def get_list(\n        self, n: int = None, shuffle: 
bool = False, with_ys: bool = False\n    ) -> Union[List[Tuple[Any, ...]], Tuple[List[Tuple[Any, ...]], List]]:\n        if n is None:\n            n = len(self)\n        if shuffle:\n            idx = np.random.choice(len(self), n, replace=n > len(self))\n        else:\n            idx = np.arange(0, min(n, len(self)))\n            if n > len(self):\n                np.random.seed(0)\n                idx = np.concatenate(\n                    [\n                        idx,\n                        np.random.choice(\n                            len(self), n - len(self), replace=True\n                        ),\n                    ]\n                )\n        if not with_ys:\n            return [self[i][0] for i in idx]\n\n        ys, xs = [], []\n        for i in idx:\n            x, y = self[i] if len(self[i]) > 1 else (self[i][0], None)\n            xs.append(x)\n            ys.append(y)\n        return xs, ys\n\n    @classmethod\n    def from_iterable(cls, iterable: Iterable, max_length: int = 500):\n        return cls([x for i, x in enumerate(iterable) if i < max_length])\n\n    @classmethod\n    def from_dataloader(\n        cls,\n        dataloader: Union[DataLoader, tf.data.Dataset],\n        max_length: int = 500,\n    ):\n        batch_size = (\n            dataloader.batch_size\n            if isinstance(dataloader, DataLoader)\n            else dataloader._batch_size\n        )\n\n        if batch_size > max_length:\n            raise ValueError(\n                f\"Batch size ({dataloader.batch_size}) is greater than \"\n                f\"max_length ({max_length}).\"\n            )\n        data_manager = []\n        warning_label = False\n        for i, batch in enumerate(dataloader):\n            if i * batch_size >= max_length:\n                break\n\n            if isinstance(batch, (list, tuple)):\n                if len(batch) == 1:\n                    data_manager.append((batch, None))\n                elif len(batch) == 2:\n               
     if isinstance(batch[0], tuple):\n                        data_manager.append((batch[0], batch[1]))\n                    elif isinstance(batch[0], (torch.Tensor, tf.Tensor)):\n                        warning_label = True\n                        data_manager.append(((batch[0],), batch[1]))\n                    else:\n                        raise ValueError(\n                            \"The first element of the batch should be a \"\n                            \"tuple or a torch.Tensor\"\n                        )\n                else:\n                    warning_label = True\n                    data_manager.append(\n                        (tuple(t for t in batch[:-1]), batch[-1])\n                    )\n            elif isinstance(batch, (torch.Tensor, tf.Tensor)):\n                data_manager.append(((batch,), None))\n            else:\n                raise ValueError(\n                    \"The batch should be a tuple, a list or a Tensor\"\n                )\n\n        if warning_label:\n            logger.warning(\n                \"The provided dataloader returns a tuple of tensors\"\n                \"for each batch. The last tensor in the tuple will \"\n                \"be considered as the label. 
\"\n                \"To avoid this warning, the dataloader should return \"\n                \"a tuple for each batch, where the first element is \"\n                \"a tuple containing the inputs and the second element \"\n                \"is a tensor containing the label.\"\n            )\n\n        return cls(data_manager)\n\n    def get_split(self, split_type=\"train\"):\n        return (\n            DataManager([self[i] for i in self.train_idxs])\n            if split_type == \"train\"\n            else DataManager([self[i] for i in self.test_idxs])\n        )\n\n    def split(self, split_pct: float, shuffle: bool = False):\n        if shuffle:\n            idx = np.random.choice(len(self), len(self), replace=False)\n        else:\n            idx = np.arange(len(self))\n\n        n = int(round(len(idx) * split_pct))\n\n        if len(self) < MIN_DIM_INPUT_DATA:\n            logger.warning(\n                f\"Not enough data for splitting the DataManager. \"\n                f\"You should provide at least {MIN_DIM_INPUT_DATA} \"\n                f\"data samples to allow a good split between train \"\n                f\"and test sets. 
Compression, calibration and precision \"\n                f\"checks will use the same data.\"\n            )\n            self.train_idxs = idx\n            self.test_idxs = idx\n        else:\n            self.train_idxs = idx[:n]\n            self.test_idxs = idx[n:]\n\n\nclass PytorchDataset(Dataset):\n    def __init__(self, input_data: DataManager, has_labels: bool = False):\n        self.data = input_data\n        self.has_labels = has_labels\n        self.batch_size = input_data[0][0][0].shape[0]\n\n    def __len__(self):\n        return sum([batch_inputs[0].shape[0] for batch_inputs, _ in self.data])\n\n    def __getitem__(self, idx):\n        batch_idx = int(idx / self.batch_size)\n        item_idx = idx % self.batch_size\n        data = tuple([data[item_idx] for data in self.data[batch_idx][0]])\n\n        if self.has_labels:\n            label = self.data[batch_idx][1]\n            if label is not None:\n                return data, self.data[batch_idx][1][item_idx]\n            else:\n                return data, torch.tensor([0])\n        else:\n            return data\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/diffusers.py",
    "content": "# Based on https://github.com/NVIDIA/TensorRT/blob/main/demo/Diffusion/models.py\n#\n#\n# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n# SPDX-License-Identifier: Apache-2.0\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom typing import Dict, Union, List, Optional, Any, Tuple\n\nfrom nebullvm.core.models import Device\nfrom nebullvm.optional_modules.diffusers import (\n    DiffusionPipeline,\n    UNet2DConditionModel,\n    UNet2DOutput,\n    AutoencoderKL,\n    onnx_graphsurgeon as gs,\n)\nfrom nebullvm.optional_modules.diffusers import StableDiffusionPipeline\nfrom nebullvm.optional_modules.huggingface import CLIPTextModel, CLIPTokenizer\nfrom nebullvm.optional_modules.onnx import onnx\nfrom nebullvm.optional_modules.tensor_rt import fold_constants\nfrom nebullvm.optional_modules.torch import torch\n\n\n@torch.no_grad()\ndef get_unet_inputs(\n    self,\n    prompt: Union[str, List[str]] = None,\n    height: Optional[int] = None,\n    width: Optional[int] = None,\n    num_inference_steps: int = 1,\n    guidance_scale: float = 7.5,\n    negative_prompt: Optional[Union[str, List[str]]] = None,\n    num_images_per_prompt: Optional[int] = 1,\n    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,\n    latents: Optional[torch.FloatTensor] = None,\n    prompt_embeds: Optional[torch.FloatTensor] = None,\n    negative_prompt_embeds: Optional[torch.FloatTensor] = 
None,\n    callback_steps: int = 1,\n    cross_attention_kwargs: Optional[Dict[str, Any]] = None,\n):\n    # 0. Default height and width to unet\n    height = height or self.unet.config.sample_size * self.vae_scale_factor\n    width = width or self.unet.config.sample_size * self.vae_scale_factor\n\n    # 1. Check inputs. Raise error if not correct\n    self.check_inputs(\n        prompt,\n        height,\n        width,\n        callback_steps,\n        negative_prompt,\n        prompt_embeds,\n        negative_prompt_embeds,\n    )\n\n    # 2. Define call parameters\n    if prompt is not None and isinstance(prompt, str):\n        batch_size = 1\n    elif prompt is not None and isinstance(prompt, list):\n        batch_size = len(prompt)\n    else:\n        batch_size = prompt_embeds.shape[0]\n\n    device = self._execution_device\n    do_classifier_free_guidance = guidance_scale > 1.0\n\n    # 3. Encode input prompt\n    prompt_embeds = self._encode_prompt(\n        prompt,\n        device,\n        num_images_per_prompt,\n        do_classifier_free_guidance,\n        negative_prompt,\n        prompt_embeds=prompt_embeds,\n        negative_prompt_embeds=negative_prompt_embeds,\n    )\n\n    # 4. Prepare timesteps\n    self.scheduler.set_timesteps(num_inference_steps, device=device)\n    timesteps = self.scheduler.timesteps\n\n    # 5. 
Prepare latent variables\n    num_channels_latents = self.unet.in_channels\n    latents = self.prepare_latents(\n        batch_size * num_images_per_prompt,\n        num_channels_latents,\n        height,\n        width,\n        prompt_embeds.dtype,\n        device,\n        generator,\n        latents,\n    )\n\n    for i, t in enumerate(timesteps):\n        # expand the latents if we are doing classifier free guidance\n        latent_model_input = (\n            torch.cat([latents] * 2)\n            if do_classifier_free_guidance\n            else latents\n        )\n        latent_model_input = self.scheduler.scale_model_input(\n            latent_model_input, t\n        )\n\n        return latent_model_input, t, prompt_embeds, cross_attention_kwargs\n\n\nclass DiffusionUNetWrapper(torch.nn.Module):\n    def __init__(self, model):\n        super().__init__()\n        self.model = model\n\n    def forward(self, *x, **kwargs):\n        return tuple(\n            self.model(x[0], x[1], encoder_hidden_states=x[2]).values()\n        )\n\n\nclass OptimizedDiffusionWrapper(torch.nn.Module):\n    def __init__(self, model):\n        super().__init__()\n        self.model = model\n\n    def forward(self, *x, **kwargs):\n        return UNet2DOutput(\n            self.model(\n                x[0],\n                x[1].reshape((1,)) if x[1].shape == torch.Size([]) else x[1],\n                kwargs[\"encoder_hidden_states\"],\n            )[0]\n        )\n\n\ndef is_diffusion_model_pipe(model):\n    return isinstance(model, DiffusionPipeline)\n\n\ndef get_default_dynamic_info(input_shape: List[Tuple[int, ...]]):\n    return {\n        \"inputs\": [\n            {\n                0: {\n                    \"name\": \"2B\",\n                    \"min_val\": input_shape[0][0],\n                    \"opt_val\": input_shape[0][0],\n                    \"max_val\": input_shape[0][0],\n                },\n                2: {\n                    \"name\": \"H\",\n               
     \"min_val\": input_shape[0][2],\n                    \"opt_val\": input_shape[0][2],\n                    \"max_val\": input_shape[0][2],\n                },\n                3: {\n                    \"name\": \"W\",\n                    \"min_val\": input_shape[0][3],\n                    \"opt_val\": input_shape[0][3],\n                    \"max_val\": input_shape[0][3],\n                },\n            },\n            {},\n            {\n                0: {\n                    \"name\": \"2B\",\n                    \"min_val\": input_shape[2][0],\n                    \"opt_val\": input_shape[2][0],\n                    \"max_val\": input_shape[2][0],\n                }\n            },\n        ],\n        \"outputs\": [{0: \"2B\", 2: \"H\", 3: \"W\"}],\n    }\n\n\ndef preprocess_diffusers(pipe: DiffusionPipeline) -> torch.nn.Module:\n    # Function that wraps the Diffusion UNet model to\n    # be compatible with the optimizations performed by nebullvm\n    model = DiffusionUNetWrapper(pipe.unet)\n    return model\n\n\ndef postprocess_diffusers(\n    optimized_model: Any,\n    pipe: StableDiffusionPipeline,\n    device: Device,\n) -> StableDiffusionPipeline:\n    # Function that puts the optimized Diffusion UNet model back\n    # into the Diffusion Pipeline\n    final_model = OptimizedDiffusionWrapper(optimized_model)\n    final_model.sample_size = pipe.unet.sample_size\n    final_model.in_channels = pipe.unet.in_channels\n    final_model.device = torch.device(device.to_torch_format())\n    final_model.config = pipe.unet.config\n    final_model.in_channels = pipe.unet.in_channels\n    pipe.unet = final_model\n    return pipe\n\n\nclass Optimizer:\n    def __init__(self, onnx_graph, verbose=False):\n        self.graph = gs.import_onnx(onnx_graph)\n        self.verbose = verbose\n\n    def info(self, prefix):\n        if self.verbose:\n            print(\n                f\"{prefix} .. 
{len(self.graph.nodes)} nodes, {len(self.graph.tensors().keys())} tensors, {len(self.graph.inputs)} inputs, {len(self.graph.outputs)} outputs\"\n            )\n\n    def cleanup(self, return_onnx=False):\n        self.graph.cleanup().toposort()\n        if return_onnx:\n            return gs.export_onnx(self.graph)\n\n    def select_outputs(self, keep, names=None):\n        self.graph.outputs = [self.graph.outputs[o] for o in keep]\n        if names:\n            for i, name in enumerate(names):\n                self.graph.outputs[i].name = name\n\n    def fold_constants(self, return_onnx=False):\n        onnx_graph = fold_constants(\n            gs.export_onnx(self.graph),\n            allow_onnxruntime_shape_inference=True,\n        )\n        self.graph = gs.import_onnx(onnx_graph)\n        if return_onnx:\n            return onnx_graph\n\n    def infer_shapes(self, return_onnx=False):\n        onnx_graph = gs.export_onnx(self.graph)\n        if onnx_graph.ByteSize() > 2147483648:\n            raise TypeError(\"ERROR: model size exceeds supported 2GB limit\")\n        else:\n            onnx_graph = onnx.shape_inference.infer_shapes(onnx_graph)\n\n        self.graph = gs.import_onnx(onnx_graph)\n        if return_onnx:\n            return onnx_graph\n\n\ndef get_path(version, inpaint=False):\n    if version == \"1.4\":\n        if inpaint:\n            return \"runwayml/stable-diffusion-inpainting\"\n        else:\n            return \"CompVis/stable-diffusion-v1-4\"\n    elif version == \"1.5\":\n        if inpaint:\n            return \"runwayml/stable-diffusion-inpainting\"\n        else:\n            return \"runwayml/stable-diffusion-v1-5\"\n    elif version == \"2.0-base\":\n        if inpaint:\n            return \"stabilityai/stable-diffusion-2-inpainting\"\n        else:\n            return \"stabilityai/stable-diffusion-2-base\"\n    elif version == \"2.0\":\n        if inpaint:\n            return \"stabilityai/stable-diffusion-2-inpainting\"\n        
else:\n            return \"stabilityai/stable-diffusion-2\"\n    elif version == \"2.1\":\n        return \"stabilityai/stable-diffusion-2-1\"\n    elif version == \"2.1-base\":\n        return \"stabilityai/stable-diffusion-2-1-base\"\n    else:\n        raise ValueError(f\"Incorrect version {version}\")\n\n\ndef get_embedding_dim(version):\n    if version in (\"1.4\", \"1.5\"):\n        return 768\n    elif version in (\"2.0\", \"2.0-base\", \"2.1\", \"2.1-base\"):\n        return 1024\n    else:\n        raise ValueError(f\"Incorrect version {version}\")\n\n\nclass BaseModel:\n    def __init__(\n        self,\n        hf_token,\n        fp16=False,\n        device=\"cuda\",\n        verbose=False,\n        path=\"\",\n        max_batch_size=16,\n        embedding_dim=768,\n        text_maxlen=77,\n    ):\n        self.name = \"SD Model\"\n        self.hf_token = hf_token\n        self.fp16 = fp16\n        self.device = device\n        self.verbose = verbose\n        self.path = path\n\n        self.min_batch = 1\n        self.max_batch = max_batch_size\n        self.min_image_shape = 256  # min image resolution: 256x256\n        self.max_image_shape = 1024  # max image resolution: 1024x1024\n        self.min_latent_shape = self.min_image_shape // 8\n        self.max_latent_shape = self.max_image_shape // 8\n\n        self.embedding_dim = embedding_dim\n        self.text_maxlen = text_maxlen\n\n    def get_model(self):\n        pass\n\n    def get_input_names(self):\n        pass\n\n    def get_output_names(self):\n        pass\n\n    def get_dynamic_axes(self):\n        return None\n\n    def get_sample_input(self, batch_size, image_height, image_width):\n        pass\n\n    def get_input_profile(\n        self, batch_size, image_height, image_width, static_batch, static_shape\n    ):\n        return None\n\n    def get_shape_dict(self, batch_size, image_height, image_width):\n        return None\n\n    def optimize(self, onnx_graph):\n        opt = 
Optimizer(onnx_graph, verbose=self.verbose)\n        opt.info(self.name + \": original\")\n        opt.cleanup()\n        opt.info(self.name + \": cleanup\")\n        opt.fold_constants()\n        opt.info(self.name + \": fold constants\")\n        opt.infer_shapes()\n        opt.info(self.name + \": shape inference\")\n        onnx_opt_graph = opt.cleanup(return_onnx=True)\n        opt.info(self.name + \": finished\")\n        return onnx_opt_graph\n\n    def check_dims(self, batch_size, image_height, image_width):\n        assert batch_size >= self.min_batch and batch_size <= self.max_batch\n        assert image_height % 8 == 0 or image_width % 8 == 0\n        latent_height = image_height // 8\n        latent_width = image_width // 8\n        assert (\n            latent_height >= self.min_latent_shape\n            and latent_height <= self.max_latent_shape\n        )\n        assert (\n            latent_width >= self.min_latent_shape\n            and latent_width <= self.max_latent_shape\n        )\n        return (latent_height, latent_width)\n\n    def get_minmax_dims(\n        self, batch_size, image_height, image_width, static_batch, static_shape\n    ):\n        min_batch = batch_size if static_batch else self.min_batch\n        max_batch = batch_size if static_batch else self.max_batch\n        latent_height = image_height // 8\n        latent_width = image_width // 8\n        min_image_height = (\n            image_height if static_shape else self.min_image_shape\n        )\n        max_image_height = (\n            image_height if static_shape else self.max_image_shape\n        )\n        min_image_width = image_width if static_shape else self.min_image_shape\n        max_image_width = image_width if static_shape else self.max_image_shape\n        min_latent_height = (\n            latent_height if static_shape else self.min_latent_shape\n        )\n        max_latent_height = (\n            latent_height if static_shape else self.max_latent_shape\n     
   )\n        min_latent_width = (\n            latent_width if static_shape else self.min_latent_shape\n        )\n        max_latent_width = (\n            latent_width if static_shape else self.max_latent_shape\n        )\n        return (\n            min_batch,\n            max_batch,\n            min_image_height,\n            max_image_height,\n            min_image_width,\n            max_image_width,\n            min_latent_height,\n            max_latent_height,\n            min_latent_width,\n            max_latent_width,\n        )\n\n\nclass CLIP(BaseModel):\n    def __init__(\n        self, hf_token, device, verbose, path, max_batch_size, embedding_dim\n    ):\n        super(CLIP, self).__init__(\n            hf_token,\n            device=device,\n            verbose=verbose,\n            path=path,\n            max_batch_size=max_batch_size,\n            embedding_dim=embedding_dim,\n        )\n        self.name = \"CLIP\"\n\n    def get_model(self):\n        return CLIPTextModel.from_pretrained(\n            self.path, subfolder=\"text_encoder\", use_auth_token=self.hf_token\n        ).to(self.device)\n\n    def get_input_names(self):\n        return [\"input_ids\"]\n\n    def get_output_names(self):\n        return [\"text_embeddings\", \"pooler_output\"]\n\n    def get_dynamic_axes(self):\n        return {\"input_ids\": {0: \"B\"}, \"text_embeddings\": {0: \"B\"}}\n\n    def get_input_profile(\n        self, batch_size, image_height, image_width, static_batch, static_shape\n    ):\n        self.check_dims(batch_size, image_height, image_width)\n        min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims(\n            batch_size, image_height, image_width, static_batch, static_shape\n        )\n        return {\n            \"input_ids\": [\n                (min_batch, self.text_maxlen),\n                (batch_size, self.text_maxlen),\n                (max_batch, self.text_maxlen),\n            ]\n        }\n\n    def 
get_shape_dict(self, batch_size, image_height, image_width):\n        self.check_dims(batch_size, image_height, image_width)\n        return {\n            \"input_ids\": (batch_size, self.text_maxlen),\n            \"text_embeddings\": (\n                batch_size,\n                self.text_maxlen,\n                self.embedding_dim,\n            ),\n        }\n\n    def get_sample_input(self, batch_size, image_height, image_width):\n        self.check_dims(batch_size, image_height, image_width)\n        return torch.zeros(\n            batch_size, self.text_maxlen, dtype=torch.int32, device=self.device\n        )\n\n    def optimize(self, onnx_graph):\n        opt = Optimizer(onnx_graph, verbose=self.verbose)\n        opt.info(self.name + \": original\")\n        opt.select_outputs([0])  # delete graph output#1\n        opt.cleanup()\n        opt.info(self.name + \": remove output[1]\")\n        opt.fold_constants()\n        opt.info(self.name + \": fold constants\")\n        opt.infer_shapes()\n        opt.info(self.name + \": shape inference\")\n        opt.select_outputs(\n            [0], names=[\"text_embeddings\"]\n        )  # rename network output\n        opt.info(self.name + \": remove output[0]\")\n        opt_onnx_graph = opt.cleanup(return_onnx=True)\n        opt.info(self.name + \": finished\")\n        return opt_onnx_graph\n\n\ndef make_CLIP(\n    version, hf_token, device, verbose, max_batch_size, inpaint=False\n):\n    return CLIP(\n        hf_token=hf_token,\n        device=device,\n        verbose=verbose,\n        path=get_path(version, inpaint=inpaint),\n        max_batch_size=max_batch_size,\n        embedding_dim=get_embedding_dim(version),\n    )\n\n\nclass UNet(BaseModel):\n    def __init__(\n        self,\n        hf_token,\n        fp16=False,\n        device=\"cuda\",\n        verbose=False,\n        path=\"\",\n        max_batch_size=16,\n        embedding_dim=768,\n        text_maxlen=77,\n        unet_dim=4,\n    ):\n        
super(UNet, self).__init__(\n            hf_token,\n            fp16=fp16,\n            device=device,\n            verbose=verbose,\n            path=path,\n            max_batch_size=max_batch_size,\n            embedding_dim=embedding_dim,\n            text_maxlen=text_maxlen,\n        )\n        self.unet_dim = unet_dim\n        self.name = \"UNet\"\n\n    def get_model(self):\n        model_opts = (\n            {\"revision\": \"fp16\", \"torch_dtype\": torch.float16}\n            if self.fp16\n            else {}\n        )\n        return UNet2DConditionModel.from_pretrained(\n            self.path,\n            subfolder=\"unet\",\n            use_auth_token=self.hf_token,\n            **model_opts,\n        ).to(self.device)\n\n    def get_input_names(self):\n        return [\"sample\", \"timestep\", \"encoder_hidden_states\"]\n\n    def get_output_names(self):\n        return [\"latent\"]\n\n    def get_dynamic_axes(self):\n        return {\n            \"sample\": {0: \"2B\", 2: \"H\", 3: \"W\"},\n            \"encoder_hidden_states\": {0: \"2B\"},\n            \"latent\": {0: \"2B\", 2: \"H\", 3: \"W\"},\n        }\n\n    def get_input_profile(\n        self, batch_size, image_height, image_width, static_batch, static_shape\n    ):\n        latent_height, latent_width = self.check_dims(\n            batch_size, image_height, image_width\n        )\n        (\n            min_batch,\n            max_batch,\n            _,\n            _,\n            _,\n            _,\n            min_latent_height,\n            max_latent_height,\n            min_latent_width,\n            max_latent_width,\n        ) = self.get_minmax_dims(\n            batch_size, image_height, image_width, static_batch, static_shape\n        )\n        return {\n            \"sample\": [\n                (\n                    2 * min_batch,\n                    self.unet_dim,\n                    min_latent_height,\n                    min_latent_width,\n                ),\n        
        (2 * batch_size, self.unet_dim, latent_height, latent_width),\n                (\n                    2 * max_batch,\n                    self.unet_dim,\n                    max_latent_height,\n                    max_latent_width,\n                ),\n            ],\n            \"encoder_hidden_states\": [\n                (2 * min_batch, self.text_maxlen, self.embedding_dim),\n                (2 * batch_size, self.text_maxlen, self.embedding_dim),\n                (2 * max_batch, self.text_maxlen, self.embedding_dim),\n            ],\n        }\n\n    def get_shape_dict(self, batch_size, image_height, image_width):\n        latent_height, latent_width = self.check_dims(\n            batch_size, image_height, image_width\n        )\n        return {\n            \"sample\": (\n                2 * batch_size,\n                self.unet_dim,\n                latent_height,\n                latent_width,\n            ),\n            \"encoder_hidden_states\": (\n                2 * batch_size,\n                self.text_maxlen,\n                self.embedding_dim,\n            ),\n            \"latent\": (2 * batch_size, 4, latent_height, latent_width),\n        }\n\n    def get_sample_input(self, batch_size, image_height, image_width):\n        latent_height, latent_width = self.check_dims(\n            batch_size, image_height, image_width\n        )\n        dtype = torch.float16 if self.fp16 else torch.float32\n        return (\n            torch.randn(\n                2 * batch_size,\n                self.unet_dim,\n                latent_height,\n                latent_width,\n                dtype=torch.float32,\n                device=self.device,\n            ),\n            torch.tensor([1.0], dtype=torch.float32, device=self.device),\n            torch.randn(\n                2 * batch_size,\n                self.text_maxlen,\n                self.embedding_dim,\n                dtype=dtype,\n                device=self.device,\n            ),\n  
      )\n\n\ndef make_UNet(\n    version, hf_token, device, verbose, max_batch_size, inpaint=False\n):\n    return UNet(\n        hf_token=hf_token,\n        fp16=True,\n        device=device,\n        verbose=verbose,\n        path=get_path(version, inpaint=inpaint),\n        max_batch_size=max_batch_size,\n        embedding_dim=get_embedding_dim(version),\n        unet_dim=(9 if inpaint else 4),\n    )\n\n\nclass VAE(BaseModel):\n    def __init__(\n        self, hf_token, device, verbose, path, max_batch_size, embedding_dim\n    ):\n        super(VAE, self).__init__(\n            hf_token,\n            device=device,\n            verbose=verbose,\n            path=path,\n            max_batch_size=max_batch_size,\n            embedding_dim=embedding_dim,\n        )\n        self.name = \"VAE decoder\"\n\n    def get_model(self):\n        vae = AutoencoderKL.from_pretrained(\n            self.path, subfolder=\"vae\", use_auth_token=self.hf_token\n        ).to(self.device)\n        vae.forward = vae.decode\n        return vae\n\n    def get_input_names(self):\n        return [\"latent\"]\n\n    def get_output_names(self):\n        return [\"images\"]\n\n    def get_dynamic_axes(self):\n        return {\n            \"latent\": {0: \"B\", 2: \"H\", 3: \"W\"},\n            \"images\": {0: \"B\", 2: \"8H\", 3: \"8W\"},\n        }\n\n    def get_input_profile(\n        self, batch_size, image_height, image_width, static_batch, static_shape\n    ):\n        latent_height, latent_width = self.check_dims(\n            batch_size, image_height, image_width\n        )\n        (\n            min_batch,\n            max_batch,\n            _,\n            _,\n            _,\n            _,\n            min_latent_height,\n            max_latent_height,\n            min_latent_width,\n            max_latent_width,\n        ) = self.get_minmax_dims(\n            batch_size, image_height, image_width, static_batch, static_shape\n        )\n        return {\n            
\"latent\": [\n                (min_batch, 4, min_latent_height, min_latent_width),\n                (batch_size, 4, latent_height, latent_width),\n                (max_batch, 4, max_latent_height, max_latent_width),\n            ]\n        }\n\n    def get_shape_dict(self, batch_size, image_height, image_width):\n        latent_height, latent_width = self.check_dims(\n            batch_size, image_height, image_width\n        )\n        return {\n            \"latent\": (batch_size, 4, latent_height, latent_width),\n            \"images\": (batch_size, 3, image_height, image_width),\n        }\n\n    def get_sample_input(self, batch_size, image_height, image_width):\n        latent_height, latent_width = self.check_dims(\n            batch_size, image_height, image_width\n        )\n        return torch.randn(\n            batch_size,\n            4,\n            latent_height,\n            latent_width,\n            dtype=torch.float32,\n            device=self.device,\n        )\n\n\ndef make_VAE(\n    version, hf_token, device, verbose, max_batch_size, inpaint=False\n):\n    return VAE(\n        hf_token=hf_token,\n        device=device,\n        verbose=verbose,\n        path=get_path(version, inpaint=inpaint),\n        max_batch_size=max_batch_size,\n        embedding_dim=get_embedding_dim(version),\n    )\n\n\nclass TorchVAEEncoder(torch.nn.Module):\n    def __init__(self, token, device, path):\n        super().__init__()\n        self.path = path\n        self.vae_encoder = AutoencoderKL.from_pretrained(\n            self.path, subfolder=\"vae\", use_auth_token=token\n        ).to(device)\n\n    def forward(self, x):\n        return self.vae_encoder.encode(x).latent_dist.sample()\n\n\nclass VAEEncoder(BaseModel):\n    def __init__(\n        self, hf_token, device, verbose, path, max_batch_size, embedding_dim\n    ):\n        super(VAEEncoder, self).__init__(\n            hf_token,\n            device=device,\n            verbose=verbose,\n            
path=path,\n            max_batch_size=max_batch_size,\n            embedding_dim=embedding_dim,\n        )\n        self.name = \"VAE encoder\"\n\n    def get_model(self):\n        vae_encoder = TorchVAEEncoder(self.hf_token, self.device, self.path)\n        return vae_encoder\n\n    def get_input_names(self):\n        return [\"images\"]\n\n    def get_output_names(self):\n        return [\"latent\"]\n\n    def get_dynamic_axes(self):\n        return {\n            \"images\": {0: \"B\", 2: \"8H\", 3: \"8W\"},\n            \"latent\": {0: \"B\", 2: \"H\", 3: \"W\"},\n        }\n\n    def get_input_profile(\n        self, batch_size, image_height, image_width, static_batch, static_shape\n    ):\n        assert batch_size >= self.min_batch and batch_size <= self.max_batch\n        min_batch = batch_size if static_batch else self.min_batch\n        max_batch = batch_size if static_batch else self.max_batch\n        self.check_dims(batch_size, image_height, image_width)\n        (\n            min_batch,\n            max_batch,\n            min_image_height,\n            max_image_height,\n            min_image_width,\n            max_image_width,\n            _,\n            _,\n            _,\n            _,\n        ) = self.get_minmax_dims(\n            batch_size, image_height, image_width, static_batch, static_shape\n        )\n\n        return {\n            \"images\": [\n                (min_batch, 3, min_image_height, min_image_width),\n                (batch_size, 3, image_height, image_width),\n                (max_batch, 3, max_image_height, max_image_width),\n            ],\n        }\n\n    def get_shape_dict(self, batch_size, image_height, image_width):\n        latent_height, latent_width = self.check_dims(\n            batch_size, image_height, image_width\n        )\n        return {\n            \"images\": (batch_size, 3, image_height, image_width),\n            \"latent\": (batch_size, 4, latent_height, latent_width),\n        }\n\n    def 
get_sample_input(self, batch_size, image_height, image_width):\n        self.check_dims(batch_size, image_height, image_width)\n        return torch.randn(\n            batch_size,\n            3,\n            image_height,\n            image_width,\n            dtype=torch.float32,\n            device=self.device,\n        )\n\n\ndef make_VAEEncoder(\n    version, hf_token, device, verbose, max_batch_size, inpaint=False\n):\n    return VAEEncoder(\n        hf_token=hf_token,\n        device=device,\n        verbose=verbose,\n        path=get_path(version, inpaint=inpaint),\n        max_batch_size=max_batch_size,\n        embedding_dim=get_embedding_dim(version),\n    )\n\n\ndef make_tokenizer(version, hf_token):\n    return CLIPTokenizer.from_pretrained(\n        get_path(version), subfolder=\"tokenizer\", use_auth_token=hf_token\n    )\n\n\ndef is_diffusion_model(model) -> bool:\n    try:\n        from diffusers import UNet2DConditionModel\n    except ImportError:\n        return False\n\n    if is_diffusion_model_pipe(model):\n        return True\n    if isinstance(model, (UNet2DConditionModel, DiffusionUNetWrapper)):\n        return True\n    if hasattr(model, \"model\"):\n        return isinstance(model.model, UNet2DConditionModel)\n    return False\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/feedback_collector.py",
    "content": "import json\nimport os\nfrom pathlib import Path\nfrom typing import Any\n\nimport requests\n\nfrom nebullvm.config import VERSION\n\nNEBULLVM_METADATA_PATH = Path.home() / \".nebullvm/collect.json\"\n\n\nclass FeedbackCollector:\n    def __init__(\n        self, url: str, disable_telemetry_environ_var: str, app_version: str\n    ):\n        self._disable_telemetry_environ_var = disable_telemetry_environ_var\n        self._is_active = (\n            int(os.getenv(disable_telemetry_environ_var, \"0\")) == 0\n        )\n        self._url = url\n        self._metadata = {\n            \"nebullvm_version\": VERSION,\n            \"app_version\": app_version,\n        }\n\n    def _store_ip_address(self):\n        try:\n            self._metadata[\"ip_address\"] = requests.get(\n                \"https://api.ipify.org\"\n            ).text\n        except Exception:\n            self._metadata[\"ip_address\"] = \"Unknown\"\n\n    @property\n    def is_active(self):\n        return self._is_active\n\n    def _inform_user(self):\n        message = (\n            f\"Nebuly collects anonymous usage statistics to help improve the \"\n            f\"product. 
You can opt-out by setting the environment variable \"\n            f\"{self._disable_telemetry_environ_var}=1.\"\n        )\n        print(message)\n\n    def store_info(self, key: str, value: Any):\n        if key in self._metadata and isinstance(value, list):\n            self._metadata[key] += value\n        else:\n            self._metadata[key] = value\n\n    def send_feedback(self, timeout: int = 30):\n        if not self.is_active:\n            return {}\n        self._store_ip_address()\n        request_body = self._metadata\n        headers = {\n            \"accept\": \"application/json\",\n            \"Content-Type\": \"application/json\",\n        }\n        response = requests.post(\n            self._url,\n            data=json.dumps(request_body),\n            headers=headers,\n            timeout=timeout,\n        )\n        return response\n\n    def get(self, key: str, default: Any = None):\n        return self._metadata.get(key, default)\n\n    def reset(self, key: str):\n        self._metadata.pop(key, None)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/hardware_utils.py",
    "content": "import os\nimport platform\n\nimport cpuinfo\nimport psutil\n\nfrom nebullvm.core.models import HardwareSetup, Device, DeviceType\nfrom nebullvm.optional_modules.torch_xla import xm\nfrom nebullvm.optional_modules.utils import (\n    torch_is_available,\n    tensorflow_is_available,\n)\nfrom nebullvm.tools.pytorch import torch_get_device_name\nfrom nebullvm.tools.tf import tensorflow_get_gpu_name\nfrom nebullvm.tools.utils import (\n    gpu_is_available,\n    tpu_is_available,\n    neuron_is_available,\n)\n\n\ndef get_hw_setup(device: Device = None) -> HardwareSetup:\n    accelerator = None\n    if (\n        device is not None and device.type is DeviceType.GPU\n    ) or gpu_is_available():\n        accelerator = _get_gpu_name()\n    elif (\n        device is not None and device.type is DeviceType.TPU\n    ) or tpu_is_available():\n        accelerator = _get_tpu_device_name()\n    elif (\n        device is not None and device.type is DeviceType.NEURON\n    ) or neuron_is_available():\n        accelerator = _get_neuron_device_name()\n    return HardwareSetup(\n        cpu=cpuinfo.get_cpu_info()[\"brand_raw\"],\n        operating_system=platform.system(),\n        memory_gb=round(psutil.virtual_memory().total * 1e-9, 2),\n        accelerator=accelerator,\n    )\n\n\ndef _get_gpu_name() -> str:\n    if torch_is_available():\n        name = torch_get_device_name()\n    elif tensorflow_is_available():\n        name = tensorflow_get_gpu_name()\n    else:\n        name = \"Unknown\"\n    return name\n\n\ndef _get_neuron_device_name() -> str:\n    output = os.popen(\"lshw -businfo\").read()\n    neuron_name = \"Unknown Neuron\"\n    for line in output.splitlines():\n        if \"neuron\" in line.lower():\n            words = line.split(\" \")\n            if len(words) > 2:\n                neuron_name = \" \".join(words[-2:])\n                break\n    return neuron_name\n\n\ndef _get_tpu_device_name() -> str:\n    return 
xm.xla_device_hw(xm.xla_device())\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/huggingface.py",
    "content": "from collections import OrderedDict\nfrom typing import (\n    Union,\n    Iterable,\n    List,\n    Dict,\n    Tuple,\n    Type,\n    Any,\n)\n\nimport numpy as np\n\nfrom nebullvm.core.models import Device, DeviceType\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch, Module\n\ntry:\n    from transformers import (\n        PreTrainedModel,\n    )\n    from transformers.tokenization_utils import PreTrainedTokenizer\nexcept ImportError:\n    # add placeholders for function definition\n    PreTrainedModel = None\n    PreTrainedTokenizer = None\n\n\nclass PyTorchTransformerWrapper(Module):\n    \"\"\"Class for wrappering the Transformers and give them an API compatible\n    with nebullvm. The class takes and input of the forward method positional\n    arguments and transform them in the input dictionaries needed by\n    transformers classes. At the end it also flattens their output.\n    \"\"\"\n\n    def __init__(\n        self,\n        core_model: Module,\n        encoded_input: Dict[str, torch.Tensor],\n    ):\n        super().__init__()\n        self.core_model = core_model\n        self.inputs_types = OrderedDict()\n        for key, value in encoded_input.items():\n            self.inputs_types[key] = value.dtype\n\n    def forward(self, *args: torch.Tensor):\n        inputs = {\n            key: value for key, value in zip(self.inputs_types.keys(), args)\n        }\n        outputs = self.core_model(**inputs)\n        outputs = outputs.values() if isinstance(outputs, dict) else outputs\n        return tuple(flatten_outputs(outputs))\n\n\nclass TensorFlowTransformerWrapper(tf.keras.Model):\n    def __init__(\n        self,\n        core_model: tf.Module,\n        encoded_input: Dict[str, tf.Tensor],\n    ):\n        super().__init__()\n        self.core_model = core_model\n        self.inputs_types = OrderedDict()\n        for key, value in encoded_input.items():\n            
self.inputs_types[key] = value.dtype\n\n    def call(self, *args: tf.Tensor):\n        inputs = {\n            key: value for key, value in zip(self.inputs_types.keys(), args[0])\n        }\n        outputs = self.core_model(**inputs)\n        outputs = outputs.values() if isinstance(outputs, dict) else outputs\n        return tuple(flatten_outputs(list(outputs)))\n\n\ndef flatten_outputs(\n    outputs: Union[torch.Tensor, tf.Tensor, Iterable]\n) -> List[Union[torch.Tensor, tf.Tensor]]:\n    new_outputs = []\n    for output in outputs:\n        if isinstance(output, (torch.Tensor, tf.Tensor)):\n            new_outputs.append(output)\n        else:\n            flatten_list = flatten_outputs(output)\n            new_outputs.extend(flatten_list)\n    return new_outputs\n\n\ndef get_size_recursively(\n    tensor_tuple: Union[torch.Tensor, tf.Tensor, Tuple]\n) -> List[int]:\n    if isinstance(tensor_tuple[0], (torch.Tensor, tf.Tensor)):\n        return [len(tensor_tuple)]\n    else:\n        inner_size = get_size_recursively(tensor_tuple[0])\n        return [len(tensor_tuple), *inner_size]\n\n\ndef get_output_structure_from_text(\n    text: str,\n    model: PreTrainedModel,\n    tokenizer: PreTrainedTokenizer,\n    tokenizer_args: Dict,\n    device: Device,\n) -> Tuple[OrderedDict, Type]:\n    \"\"\"Function needed for saving in a dictionary the output structure of the\n    transformers model.\n    \"\"\"\n    encoded_input = tokenizer([text], **tokenizer_args)\n    if isinstance(model, torch.nn.Module):\n        encoded_input = encoded_input.to(device.to_torch_format())\n    output = model(**encoded_input)\n    structure = OrderedDict()\n    if isinstance(output, tuple):\n        for i, value in enumerate(output):\n            if isinstance(value, (torch.Tensor, tf.Tensor)):\n                structure[f\"output_{i}\"] = None\n            else:\n                size = get_size_recursively(value)\n                structure[f\"output_{i}\"] = size\n    else:\n        for 
key, value in output.items():\n            if isinstance(value, (torch.Tensor, tf.Tensor)):\n                structure[key] = None\n            else:\n                size = get_size_recursively(value)\n                structure[key] = size\n    return structure, type(output)\n\n\ndef get_output_structure_from_dict(\n    input_example: Dict,\n    model: PreTrainedModel,\n    device: Device,\n) -> Tuple[OrderedDict, Type]:\n    \"\"\"Function needed for saving in a dictionary the output structure of the\n    transformers model.\n    \"\"\"\n\n    if (\n        isinstance(model, torch.nn.Module)\n        and device.type is not DeviceType.TPU\n    ):\n        model.to(device.to_torch_format())\n        input_example.to(device.to_torch_format())\n\n    output = model(**input_example)\n    structure = OrderedDict()\n    if isinstance(output, tuple):\n        for i, value in enumerate(output):\n            if isinstance(value, (torch.Tensor, tf.Tensor)):\n                structure[f\"output_{i}\"] = None\n            else:\n                size = get_size_recursively(value)\n                structure[f\"output_{i}\"] = size\n    else:\n        for key, value in output.items():\n            if isinstance(value, (torch.Tensor, tf.Tensor)):\n                structure[key] = None\n            else:\n                size = get_size_recursively(value)\n                structure[key] = size\n    return structure, type(output)\n\n\ndef restructure_output(\n    output: Tuple[Union[torch.Tensor, tf.Tensor]],\n    structure: OrderedDict,\n    output_type: Any = None,\n):\n    \"\"\"Restructure the flatter output using the structure dictionary given as\n    input.\n    \"\"\"\n    output_dict = {}\n    idx = 0\n    for key, value in structure.items():\n        if value is None:\n            output_dict[key] = output[idx]\n            idx += 1\n        else:\n            tensor_shape = output[idx].shape[1:]\n            stack_fn = (\n                torch.stack\n                if 
isinstance(output[idx], torch.Tensor)\n                else tf.stack\n            )\n            reshape_fn = (\n                torch.reshape\n                if isinstance(output[idx], torch.Tensor)\n                else tf.reshape\n            )\n\n            output_dict[key] = list(\n                reshape_fn(\n                    stack_fn(\n                        output[idx : int(np.prod(value)) + idx]  # noqa E203\n                    ),\n                    (*value, *tensor_shape),\n                )\n            )\n            idx += np.prod(value)\n    if output_type is not None:\n        return output_type(**output_dict)\n    return output_dict\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/logger.py",
    "content": "import logging\nimport os\nimport sys\nimport warnings\nfrom typing import Any\n\nfrom loguru import logger\n\n\nlevels_map = {\n    0: \"ERROR\",\n    1: \"WARNING\",\n    2: \"INFO\",\n    3: \"DEBUG\",\n}\n\n\ndef debug_mode_enabled():\n    return int(os.environ.get(\"DEBUG_MODE\", \"0\")) > 0\n\n\ndef setup_logger():\n    if not debug_mode_enabled():\n        warnings.filterwarnings(\"ignore\")\n\n    logging_level = int(os.environ.get(\"NEBULLVM_LOG_LEVEL\", \"2\"))\n\n    logger.remove()\n    logger.add(\n        sys.stdout,\n        colorize=True,\n        format=(\n            \"<green>{time:YYYY-MM-DD HH:mm:ss}</green> | \"\n            \"<level>{level: <8}</level> | <level>{message}</level>\"\n        ),\n        level=levels_map[logging_level],\n    )\n    logger.level(\"WARNING\", color=\"<fg #d3d3d3>\")\n\n\nclass LoggingContext(object):\n    def __init__(\n        self,\n        logger: logging.Logger,\n        disabled: bool = False,\n        handler: Any = None,\n        close: bool = True,\n    ):\n        self.logger = logger\n        self.disabled = disabled\n        self.handler = handler\n        self.close = close\n\n    def __enter__(self):\n        self.logger.disabled = self.disabled\n        if self.handler:\n            self.logger.addHandler(self.handler)\n\n    def __exit__(self, et: Any, ev: Any, tb: Any):\n        if self.disabled is True:\n            self.logger.disabled = False\n        if self.handler:\n            self.logger.removeHandler(self.handler)\n        if self.handler and self.close:\n            self.handler.close()\n        # implicit return of None => don't swallow exceptions\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/onnx.py",
    "content": "from typing import List, Tuple, Any, Optional, Dict\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.config import ONNX_PROVIDERS\nfrom nebullvm.core.models import (\n    DeepLearningFramework,\n    Device,\n    DeviceType,\n    InputInfo,\n    DataType,\n)\nfrom nebullvm.optional_modules.onnx import onnx\nfrom nebullvm.optional_modules.onnxruntime import onnxruntime as ort\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\n\n\ndef convert_to_numpy(tensor: Any):\n    if isinstance(tensor, torch.Tensor):\n        tensor = tensor.cpu().detach().numpy()\n    elif isinstance(tensor, tf.Tensor) and tensor is not None:\n        tensor = tensor.numpy()\n    elif isinstance(tensor, int):\n        tensor = np.array([tensor])\n    else:\n        if not isinstance(tensor, np.ndarray):\n            raise TypeError(f\"Unsupported data type: {type(tensor)}\")\n    return tensor\n\n\ndef convert_to_target_framework(\n    tensor: np.ndarray, framework: DeepLearningFramework\n) -> Any:\n    if framework is DeepLearningFramework.PYTORCH:\n        return torch.from_numpy(tensor)\n    elif framework is DeepLearningFramework.TENSORFLOW:\n        return tf.convert_to_tensor(tensor)\n    else:\n        return tensor\n\n\ndef get_input_names(onnx_model: str):\n    model = onnx.load(onnx_model)\n    input_all = [node.name for node in model.graph.input]\n    return input_all\n\n\ndef get_output_names(onnx_model: str):\n    model = onnx.load(onnx_model)\n    output_all = [node.name for node in model.graph.output]\n    return output_all\n\n\ndef run_onnx_model(\n    onnx_model: str, input_tensors: List[np.ndarray], device: Device\n) -> List[np.ndarray]:\n    from nebullvm.optional_modules.onnxruntime import onnxruntime as ort\n\n    if device.type is DeviceType.GPU and len(ONNX_PROVIDERS[\"cuda\"]) == 3:\n        ONNX_PROVIDERS[\"cuda\"][1] = (\n            \"CUDAExecutionProvider\",\n         
   {\n                \"device_id\": device.idx,\n            },\n        )\n\n    model = ort.InferenceSession(\n        onnx_model,\n        providers=ONNX_PROVIDERS[\"cuda\"][1:]\n        if device.type is DeviceType.GPU\n        else ONNX_PROVIDERS[\"cpu\"],\n    )\n    inputs = {\n        name: array\n        for name, array in zip(get_input_names(onnx_model), input_tensors)\n    }\n    res = model.run(\n        output_names=get_output_names(onnx_model), input_feed=inputs\n    )\n    return list(res)\n\n\ndef _extract_dynamic_axis(\n    onnx_model: str,\n    data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]],\n    input_sizes: List[Tuple[int, ...]],\n    device: Device,\n    max_data: int = 100,\n) -> Optional[Dict]:\n    from nebullvm.tools.utils import inspect_dynamic_size\n\n    dynamic_axis = {\"inputs\": [{}] * len(input_sizes), \"outputs\": []}\n    output_sizes = []\n    for i, input_data in enumerate(data):\n        input_tensors = input_data[0]\n        if i >= max_data:\n            break\n        inspect_dynamic_size(\n            input_tensors, input_sizes, dynamic_axis[\"inputs\"]\n        )\n        outputs = tuple(\n            run_onnx_model(onnx_model, list(input_tensors), device)\n        )\n        if i == 0:\n            dynamic_axis[\"outputs\"] = [{}] * len(outputs)\n            output_sizes = [tuple(output.shape[1:]) for output in outputs]\n        inspect_dynamic_size(outputs, output_sizes, dynamic_axis[\"outputs\"])\n    if any(\n        len(x) > 0 for x in (dynamic_axis[\"inputs\"] + dynamic_axis[\"outputs\"])\n    ):\n        return dynamic_axis\n    return None\n\n\ndef extract_info_from_np_data(\n    onnx_model: str,\n    data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]],\n    dynamic_axis: Dict,\n    device: Device,\n    **kwargs,\n):\n    from nebullvm.tools.utils import ifnone\n\n    input_row = data[0][0]\n    batch_size = int(input_row[0].shape[0])\n    if not all([input_row[0].shape[0] == x.shape[0] for x in 
input_row]):\n        logger.warning(\"Detected not consistent batch size in the inputs.\")\n\n    input_sizes = [tuple(x.shape) for x in input_row]\n    input_types = [\n        \"int32\"\n        if x.dtype is np.int32\n        else \"int64\"\n        if x.dtype is np.int64\n        else \"float16\"\n        if x.dtype is np.float16\n        else \"float32\"\n        for x in input_row\n    ]\n    dynamic_axis = ifnone(\n        dynamic_axis,\n        _extract_dynamic_axis(onnx_model, data, input_sizes, device),\n    )\n    return batch_size, input_sizes, input_types, dynamic_axis\n\n\ndef get_output_info_onnx(\n    onnx_model: str, input_tensors: List[np.ndarray], device\n) -> List[Tuple[Tuple[int, ...], DataType]]:\n    res = run_onnx_model(onnx_model, input_tensors, device)\n    sizes = [\n        (tuple(output.shape), DataType.from_framework_format(output.dtype))\n        for output in res\n    ]\n    return sizes\n\n\ndef create_model_inputs_onnx(input_infos: List[InputInfo]) -> List[np.ndarray]:\n    input_tensors = (\n        np.random.randn(*input_info.size).astype(np.float32)\n        if input_info.dtype is DataType.FLOAT32\n        else np.random.randint(\n            size=input_info.size,\n            low=input_info.min_value or 0,\n            high=input_info.max_value or 100,\n        )\n        for input_info in input_infos\n    )\n    return list(input_tensors)\n\n\ndef onnx_is_gpu_available():\n    return ort.get_device() == \"GPU\"\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/pytorch.py",
    "content": "from pathlib import Path\nfrom typing import List, Tuple, Optional, Dict, Union, Sequence\n\nfrom loguru import logger\n\nfrom nebullvm.core.models import Device, DataType, DeviceType, InputInfo\nfrom nebullvm.optional_modules.torch import torch, DataLoader\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.diffusers import get_default_dynamic_info\n\nFX_MODULE_NAME = \"NebullvmFxModule\"\n\n\ndef save_with_torch_fx(model: torch.nn.Module, path: Path):\n    traced_model = torch.fx.symbolic_trace(model)\n    traced_model.to_folder(path, FX_MODULE_NAME)\n\n\ndef load_with_torch_fx(\n    path: Path, state_dict_name: str = \"pruned_state_dict.pt\"\n):\n    module_file = path / \"module.py\"\n    with open(module_file, \"r\") as f:\n        module_str = f.read()\n    exec(module_str, globals())\n    model = eval(FX_MODULE_NAME)()\n    model.load_state_dict(torch.load(path / state_dict_name))\n    return model\n\n\ndef get_output_info_torch(\n    torch_model: torch.nn.Module,\n    input_tensors: List[torch.Tensor],\n    device: Device,\n) -> List[Tuple[Tuple[int, ...], DataType]]:\n    if device.type is DeviceType.GPU:\n        input_tensors = [x.to(device.to_torch_format()) for x in input_tensors]\n        torch_model.to(device.to_torch_format())\n    with torch.no_grad():\n        outputs = torch_model(*input_tensors)\n        if isinstance(outputs, torch.Tensor):\n            return [\n                (\n                    tuple(outputs.size()),\n                    DataType.from_framework_format(outputs.dtype),\n                )\n            ]\n        else:\n            return [\n                (\n                    tuple(output.size()),\n                    DataType.from_framework_format(output.dtype),\n                )\n                for output in outputs\n            ]\n\n\ndef create_model_inputs_torch(\n    input_infos: List[InputInfo],\n) -> List[torch.Tensor]:\n    input_tensors = (\n        
torch.randn(*input_info.size)\n        if input_info.dtype is DataType.FLOAT32\n        else torch.randint(\n            size=input_info.size,\n            low=input_info.min_value or 0,\n            high=input_info.max_value or 100,\n        )\n        for input_info in input_infos\n    )\n    return list(input_tensors)\n\n\ndef run_torch_model(\n    torch_model: torch.nn.Module,\n    input_tensors: List[torch.Tensor],\n    device: Device,\n    dtype: torch.dtype = torch.float,\n) -> List[torch.Tensor]:\n    torch_model.eval()\n    if device.type is DeviceType.GPU:\n        torch_model.to(device.to_torch_format())\n        if dtype != torch.half:\n            input_tensors = (\n                t.to(device.to_torch_format()) for t in input_tensors\n            )\n        else:\n            input_tensors = (\n                t.to(device.to_torch_format()).half()\n                if t.dtype == torch.float\n                else t.to(device.to_torch_format())\n                for t in input_tensors\n            )\n    with torch.no_grad():\n        pred = torch_model(*input_tensors)\n    if isinstance(pred, torch.Tensor):\n        pred = [pred.cpu()]\n    else:\n        pred = [p.cpu() for p in pred]\n    return pred\n\n\ndef _extract_dynamic_axis(\n    torch_model: torch.nn.Module,\n    dataloader: DataManager,\n    input_sizes: List[Tuple[int, ...]],\n    device: Device,\n    max_data: int = 100,\n) -> Optional[Dict]:\n    from nebullvm.tools.utils import inspect_dynamic_size\n\n    dynamic_axis = {\"inputs\": [{}] * len(input_sizes), \"outputs\": []}\n    output_sizes = []\n    for i, input_data in enumerate(dataloader):\n        input_tensors = input_data[0]\n        if i >= max_data:\n            break\n        inspect_dynamic_size(\n            input_tensors, input_sizes, dynamic_axis[\"inputs\"]\n        )\n        outputs = tuple(run_torch_model(torch_model, input_tensors, device))\n        if i == 0:\n            dynamic_axis[\"outputs\"] = [{}] * 
len(outputs)\n            output_sizes = [tuple(output.shape) for output in outputs]\n        inspect_dynamic_size(outputs, output_sizes, dynamic_axis[\"outputs\"])\n    if any(\n        len(x) > 0 for x in (dynamic_axis[\"inputs\"] + dynamic_axis[\"outputs\"])\n    ):\n        return dynamic_axis\n    return None\n\n\ndef extract_info_from_torch_data(\n    model: torch.nn.Module,\n    dataloader: Union[DataLoader, Sequence],\n    dynamic_axis: Dict,\n    device: Device,\n    is_diffusion: bool = False,\n):\n    from nebullvm.tools.utils import ifnone\n\n    input_data = (\n        dataloader[0]\n        if isinstance(dataloader, Sequence)\n        else next(iter(dataloader))\n    )\n    input_row = input_data[0]\n    batch_size = int(input_row[0].shape[0])\n    if not all([input_row[0].shape[0] == x.shape[0] for x in input_row]):\n        logger.warning(\"Detected not consistent batch size in the inputs.\")\n\n    input_sizes = [tuple(x.shape) for x in input_row]\n    input_types = [\n        \"int64\"\n        if isinstance(x.cpu(), torch.LongTensor)\n        else \"int32\"\n        if isinstance(x.cpu(), torch.IntTensor)\n        else \"float16\"\n        if isinstance(x.cpu(), torch.HalfTensor)\n        else \"float32\"\n        for x in input_row\n    ]\n\n    # For the Stable Diffusion UNet we must provide dynamic axis\n    # even when using static shapes, because otherwise the converted\n    # onnx model will have size issues.\n    if dynamic_axis is None and device.type is DeviceType.GPU and is_diffusion:\n        dynamic_axis = get_default_dynamic_info(input_sizes)\n\n    if dynamic_axis is not None:\n        dynamic_axis[\"inputs\"] = [\n            {int(k): v for (k, v) in val.items()}\n            for val in dynamic_axis[\"inputs\"]\n        ]\n        dynamic_axis[\"outputs\"] = [\n            {int(k): v for (k, v) in val.items()}\n            for val in dynamic_axis[\"outputs\"]\n        ]\n\n    dynamic_axis = ifnone(\n        dynamic_axis,\n        
_extract_dynamic_axis(model, dataloader, input_sizes, device),\n    )\n    return batch_size, input_sizes, input_types, dynamic_axis\n\n\ndef torch_is_gpu_available():\n    return torch.cuda.is_available()\n\n\ndef torch_get_device_name():\n    return torch.cuda.get_device_name(0)\n\n\ndef get_torch_model_size(\n    model: Union[torch.nn.Module, torch.jit.ScriptModule, torch.fx.GraphModule]\n):\n    param_size = 0\n    for param in model.parameters():\n        param_size += param.nelement() * param.element_size()\n    buffer_size = 0\n    for buffer in model.buffers():\n        buffer_size += buffer.nelement() * buffer.element_size()\n\n    return param_size + buffer_size\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/tests/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/tests/test_data.py",
    "content": "import tensorflow as tf\nimport torch\n\nfrom nebullvm.tools.data import DataManager\n\n\ndef test_custom_input_data():\n    input_data = [\n        ((torch.randn(2, 3, 10, 10),), torch.randn(2, 1)),\n        ((torch.randn(2, 3, 10, 10),), torch.randn(2, 1)),\n        ((torch.randn(2, 3, 10, 10),), torch.randn(2, 1)),\n        ((torch.randn(2, 3, 10, 10),), torch.randn(2, 1)),\n    ]\n\n    data_manager = DataManager(input_data)\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 1\n    assert data_manager[0][0][0].shape == (2, 3, 10, 10)\n    assert data_manager[0][1].shape == (2, 1)\n\n\ndef test_torch_dataloader_single_input_with_label():\n    dataset = torch.utils.data.TensorDataset(\n        torch.randn(8, 3, 10, 10), torch.randn(8, 1)\n    )\n    dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)\n    data_manager = DataManager.from_dataloader(dataloader)\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 1\n    assert data_manager[0][0][0].shape == (2, 3, 10, 10)\n    assert data_manager[0][1].shape == (2, 1)\n\n\ndef test_torch_dataloader_two_inputs_with_label():\n    dataset = torch.utils.data.TensorDataset(\n        torch.randn(8, 3, 10, 10), torch.randn(8, 3, 10, 10), torch.randn(8, 1)\n    )\n\n    dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)\n    data_manager = DataManager.from_dataloader(dataloader)\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 2\n    assert data_manager[0][0][0].shape == (2, 3, 10, 10)\n    assert data_manager[0][0][1].shape == (2, 3, 10, 10)\n    assert data_manager[0][1].shape == (2, 1)\n\n\ndef test_torch_dataloader_three_inputs_with_label():\n    dataset = torch.utils.data.TensorDataset(\n        torch.randn(8, 3, 10, 10),\n        torch.randn(8, 3, 10, 10),\n        torch.randn(8, 3, 10, 
10),\n        torch.randn(8, 1),\n    )\n    dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)\n    data_manager = DataManager.from_dataloader(dataloader)\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 3\n    assert data_manager[0][0][0].shape == (2, 3, 10, 10)\n    assert data_manager[0][0][1].shape == (2, 3, 10, 10)\n    assert data_manager[0][0][2].shape == (2, 3, 10, 10)\n    assert data_manager[0][1].shape == (2, 1)\n\n\ndef test_torch_dataloader_single_input_without_label():\n    dataset = torch.utils.data.TensorDataset(torch.randn(8, 3, 10, 10))\n    dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)\n    data_manager = DataManager.from_dataloader(dataloader)\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 1\n    assert data_manager[0][0][0].shape == (2, 3, 10, 10)\n\n\ndef test_tensorflow_dataloader_single_input_with_label():\n    dataset = tf.data.Dataset.from_tensor_slices(\n        (tf.random.normal([8, 10, 10, 3]), tf.random.normal([8, 1]))\n    )\n    data_manager = DataManager.from_dataloader(dataset.batch(2))\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 1\n    assert data_manager[0][0][0].shape == (2, 10, 10, 3)\n    assert data_manager[0][1].shape == (2, 1)\n\n\ndef test_tensorflow_dataloader_two_inputs_with_label():\n    dataset = tf.data.Dataset.from_tensor_slices(\n        (\n            tf.random.normal([8, 10, 10, 3]),\n            tf.random.normal([8, 10, 10, 3]),\n            tf.random.normal([8, 1]),\n        )\n    )\n    data_manager = DataManager.from_dataloader(dataset.batch(2))\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 2\n    assert data_manager[0][0][0].shape == (2, 10, 10, 3)\n    assert data_manager[0][0][1].shape == (2, 10, 10, 3)\n    
assert data_manager[0][1].shape == (2, 1)\n\n\ndef test_tensorflow_dataloader_three_inputs_with_label():\n    dataset = tf.data.Dataset.from_tensor_slices(\n        (\n            tf.random.normal([8, 10, 10, 3]),\n            tf.random.normal([8, 10, 10, 3]),\n            tf.random.normal([8, 10, 10, 3]),\n            tf.random.normal([8, 1]),\n        )\n    )\n    data_manager = DataManager.from_dataloader(dataset.batch(2))\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 3\n    assert data_manager[0][0][0].shape == (2, 10, 10, 3)\n    assert data_manager[0][0][1].shape == (2, 10, 10, 3)\n    assert data_manager[0][0][2].shape == (2, 10, 10, 3)\n    assert data_manager[0][1].shape == (2, 1)\n\n\ndef test_tensorflow_dataloader_single_input_without_label():\n    dataset = tf.data.Dataset.from_tensor_slices(\n        tf.random.normal([8, 10, 10, 3])\n    )\n    data_manager = DataManager.from_dataloader(dataset.batch(2))\n\n    assert len(data_manager) == 4\n    assert len(data_manager[0]) == 2\n    assert len(data_manager[0][0]) == 1\n    assert data_manager[0][0][0].shape == (2, 10, 10, 3)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/tests/test_hardware_utils.py",
    "content": "import unittest\nfrom unittest.mock import patch\n\nfrom nebullvm.tools import hardware_utils\n\n\nclass TestGetHwSetup(unittest.TestCase):\n    @patch(\n        \"nebullvm.tools.hardware_utils.gpu_is_available\", return_value=False\n    )\n    @patch(\n        \"nebullvm.tools.hardware_utils.tpu_is_available\", return_value=False\n    )\n    @patch(\n        \"nebullvm.tools.hardware_utils.neuron_is_available\", return_value=False\n    )\n    def test_hw_setup__gpu_not_available(self, *_):\n        setup = hardware_utils.get_hw_setup()\n        self.assertIsNone(setup.accelerator)\n        self.assertGreater(len(setup.cpu), 0)\n        self.assertGreater(len(setup.operating_system), 0)\n        self.assertGreater(setup.memory_gb, 0)\n\n    @patch(\"nebullvm.tools.hardware_utils.gpu_is_available\", return_value=True)\n    @patch(\n        \"nebullvm.tools.hardware_utils._get_gpu_name\", return_value=\"mock-gpu\"\n    )\n    def test_hw_setup__gpu_is_available(self, *_):\n        setup = hardware_utils.get_hw_setup()\n        self.assertEqual(\"mock-gpu\", setup.accelerator)\n        self.assertGreater(len(setup.cpu), 0)\n        self.assertGreater(len(setup.operating_system), 0)\n        self.assertGreater(setup.memory_gb, 0)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/tests/test_utils.py",
    "content": "import unittest\nfrom unittest.mock import patch\n\nfrom nebullvm.core.models import DeviceType\nfrom nebullvm.tools import utils\n\n\nclass TestGetThroughput(unittest.TestCase):\n    def test_latency_is_zero(self):\n        self.assertEqual(-1, utils.get_throughput(0, 10))\n\n\nclass TestCheckDevice(unittest.TestCase):\n    @patch(\"nebullvm.tools.utils.gpu_is_available\", return_value=False)\n    @patch(\"nebullvm.tools.utils.tpu_is_available\", return_value=False)\n    @patch(\"nebullvm.tools.utils.neuron_is_available\", return_value=False)\n    def test_device_is_none_no_device_available(self, *_):\n        device = utils.check_device()\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n    @patch(\"nebullvm.tools.utils.gpu_is_available\", return_value=True)\n    @patch(\"nebullvm.tools.utils.neuron_is_available\", return_value=False)\n    @patch(\"nebullvm.tools.utils.tpu_is_available\", return_value=False)\n    def test_device_is_none_gpu_is_available(self, *_):\n        device = utils.check_device()\n        self.assertEqual(DeviceType.GPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n    @patch(\"nebullvm.tools.utils.tpu_is_available\", return_value=True)\n    @patch(\"nebullvm.tools.utils.gpu_is_available\", return_value=False)\n    @patch(\"nebullvm.tools.utils.neuron_is_available\", return_value=False)\n    def test_device_is_none_tpu_is_available(self, *_):\n        device = utils.check_device()\n        self.assertEqual(DeviceType.TPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n    @patch(\"nebullvm.tools.utils.neuron_is_available\", return_value=True)\n    @patch(\"nebullvm.tools.utils.gpu_is_available\", return_value=False)\n    @patch(\"nebullvm.tools.utils.tpu_is_available\", return_value=False)\n    def test_device_is_none_neuron_is_available(self, *_):\n        device = utils.check_device()\n        self.assertEqual(DeviceType.NEURON, device.type)\n        
self.assertEqual(device.idx, 0)\n\n    def test_device_is_cpu(self):\n        device = utils.check_device(\"cpu\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n    @patch(\"nebullvm.tools.utils.gpu_is_available\", return_value=False)\n    def test_device_is_gpu_no_gpu_available(self, _):\n        device = utils.check_device(\"gpu\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"cuda\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"cuda:1\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"gpu:2\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n    @patch(\"nebullvm.tools.utils.gpu_is_available\", return_value=True)\n    def test_device_is_gpu_gpu_is_available(self, _):\n        device = utils.check_device(\"gpu\")\n        self.assertEqual(DeviceType.GPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"cuda\")\n        self.assertEqual(DeviceType.GPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"cuda:1\")\n        self.assertEqual(DeviceType.GPU, device.type)\n        self.assertEqual(device.idx, 1)\n\n        device = utils.check_device(\"gpu:2\")\n        self.assertEqual(DeviceType.GPU, device.type)\n        self.assertEqual(device.idx, 2)\n\n    @patch(\"nebullvm.tools.utils.tpu_is_available\", return_value=False)\n    def test_device_is_tpu_no_tpu_available(self, _):\n        device = utils.check_device(\"tpu\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"tpu:1\")\n        self.assertEqual(DeviceType.CPU, 
device.type)\n        self.assertEqual(device.idx, 0)\n\n    @patch(\"nebullvm.tools.utils.tpu_is_available\", return_value=True)\n    def test_device_is_tpu_tpu_is_available(self, _):\n        device = utils.check_device(\"tpu\")\n        self.assertEqual(DeviceType.TPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"tpu:1\")\n        self.assertEqual(DeviceType.TPU, device.type)\n        self.assertEqual(device.idx, 1)\n\n    @patch(\"nebullvm.tools.utils.neuron_is_available\", return_value=False)\n    def test_device_is_neuron_no_neuron_available(self, _):\n        device = utils.check_device(\"neuron\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"neuron:1\")\n        self.assertEqual(DeviceType.CPU, device.type)\n        self.assertEqual(device.idx, 0)\n\n    @patch(\"nebullvm.tools.utils.neuron_is_available\", return_value=True)\n    def test_device_is_neuron_neuron_is_available(self, _):\n        device = utils.check_device(\"neuron\")\n        self.assertEqual(DeviceType.NEURON, device.type)\n        self.assertEqual(device.idx, 0)\n\n        device = utils.check_device(\"neuron:1\")\n        self.assertEqual(DeviceType.NEURON, device.type)\n        self.assertEqual(device.idx, 1)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/tf.py",
    "content": "from typing import Union, List, Tuple, Any, Optional, Dict\n\nimport numpy as np\nfrom loguru import logger\n\nfrom nebullvm.core.models import Device, DataType, InputInfo\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\n\n\ndef get_output_info_tf(\n    tf_model: Union[tf.Module, tf.keras.Model],\n    input_tensors: List[tf.Tensor],\n    device: Device,\n) -> List[Tuple[Tuple[int, ...], DataType]]:\n    with tf.device(device.to_tf_format()):\n        outputs = tf_model(input_tensors)\n    if isinstance(outputs, tf.Tensor) and outputs is not None:\n        return [\n            (\n                tuple(outputs.shape),\n                DataType.from_framework_format(outputs.dtype),\n            )\n        ]\n    return [\n        (tuple(x.shape), DataType.from_framework_format(x.dtype))\n        for x in outputs\n    ]\n\n\ndef create_model_inputs_tf(input_infos: List[InputInfo]) -> List[tf.Tensor]:\n    return [\n        tf.random_normal_initializer()(\n            shape=(\n                input_info.size[0],\n                *input_info.size[2:],\n                input_info.size[1],\n            )\n        )\n        if input_info.dtype is DataType.FLOAT32\n        else tf.random.uniform(\n            shape=(\n                input_info.size[0],\n                *input_info.size[2:],\n                input_info.size[1],\n            ),\n            minval=input_info.min_value or 0,\n            maxval=input_info.max_value or 100,\n            dtype=tf.int32,\n        )\n        for input_info in input_infos\n    ]\n\n\ndef run_tf_model(\n    model: tf.Module,\n    input_tensors: Tuple[tf.Tensor],\n    device: Device,\n) -> Tuple[tf.Tensor]:\n    with tf.device(device.to_tf_format()):\n        pred = model(input_tensors)\n    if isinstance(pred, tf.Tensor):\n        pred = (pred,)\n    return pred\n\n\ndef _extract_dynamic_axis(\n    tf_model: tf.Module,\n    dataset: List[Tuple[Tuple[tf.Tensor, ...], Any]],\n    input_sizes: 
List[Tuple[int, ...]],\n    device: Device,\n    max_data: int = 100,\n) -> Optional[Dict]:\n    from nebullvm.tools.utils import inspect_dynamic_size\n\n    dynamic_axis = {\"inputs\": [{}] * len(input_sizes), \"outputs\": []}\n    output_sizes = []\n    for i, input_data in enumerate(dataset):\n        input_tensors = input_data[0]\n        if i >= max_data:\n            break\n        inspect_dynamic_size(\n            input_tensors, input_sizes, dynamic_axis[\"inputs\"]\n        )\n        outputs = tuple(run_tf_model(tf_model, input_tensors, device))\n        if i == 0:\n            dynamic_axis[\"outputs\"] = [{}] * len(outputs)\n            output_sizes = [tuple(output.shape[1:]) for output in outputs]\n        inspect_dynamic_size(outputs, output_sizes, dynamic_axis[\"outputs\"])\n    if any(\n        len(x) > 0 for x in (dynamic_axis[\"inputs\"] + dynamic_axis[\"outputs\"])\n    ):\n        return dynamic_axis\n    return None\n\n\ndef extract_info_from_tf_data(\n    tf_model: tf.Module,\n    dataset: List[Tuple[Tuple[tf.Tensor, ...], Any]],\n    dynamic_axis: Dict,\n    device: Device,\n    **kwargs,\n):\n    from nebullvm.tools.utils import ifnone\n\n    input_row = dataset[0][0]\n    batch_size = int(input_row[0].shape[0])\n    if not all([input_row[0].shape[0] == x.shape[0] for x in input_row]):\n        logger.warning(\"Detected not consistent batch size in the inputs.\")\n\n    input_sizes = [tuple(x.shape) for x in input_row]\n    input_types = [\n        \"int32\"\n        if x.dtype in [tf.int32, np.int32]\n        else \"int64\"\n        if x.dtype in [tf.int64, np.int64]\n        else \"float16\"\n        if x.dtype in [tf.float16, np.float16]\n        else \"float32\"\n        for x in input_row\n    ]\n\n    dynamic_axis = ifnone(\n        dynamic_axis,\n        _extract_dynamic_axis(tf_model, dataset, input_sizes, device),\n    )\n    return batch_size, input_sizes, input_types, dynamic_axis\n\n\ndef tensorflow_is_gpu_available():\n    return 
len(tf.config.list_physical_devices(\"GPU\")) > 0\n\n\ndef tensorflow_get_gpu_name():\n    gpu_devices = tf.config.list_physical_devices(\"GPU\")\n    if gpu_devices:\n        details = tf.config.experimental.get_device_details(gpu_devices[0])\n        details.get(\"device_name\", \"Unknown GPU\")\n        return details[\"device_name\"]\n    else:\n        return \"Unknown GPU\"\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/transformations.py",
    "content": "import copy\nfrom abc import ABC, abstractmethod\nfrom typing import List, Any, Dict\n\nimport numpy as np\n\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\n\n\nclass BaseTransformation(ABC):\n    @abstractmethod\n    def _transform(self, _input: Any, **kwargs) -> Any:\n        raise NotImplementedError()\n\n    def __call__(self, _input: Any, **kwargs):\n        return self._transform(_input, **kwargs)\n\n    def to_dict(self):\n        return {\n            \"module\": self.__class__.__module__,\n            \"name\": self.__class__.__name__,\n        }\n\n    @classmethod\n    def from_dict(cls, tfm_dict: Dict):\n        return cls()\n\n\nclass MultiStageTransformation(BaseTransformation):\n    def __init__(self, transformations: List[BaseTransformation]):\n        self._tfms = transformations\n\n    def _transform(self, _input: Any, **kwargs) -> Any:\n        for tfm in self._tfms:\n            _input = tfm(_input, **kwargs)\n        return _input\n\n    def append(self, __tfm: BaseTransformation):\n        self._tfms.append(__tfm)\n\n    def extend(self, tfms: List[BaseTransformation]):\n        self._tfms += tfms\n\n    def to_dict(self) -> Dict:\n        return {\"tfms\": [tfm.to_dict() for tfm in self._tfms]}\n\n    def to_list(self):\n        return self._tfms\n\n    @classmethod\n    def from_dict(cls, tfms_dict: Dict):\n        tfms = []\n        for tfm_dict in tfms_dict[\"tfms\"]:\n            exec(f\"from {tfm_dict['module']} import {tfm_dict['name']}\")\n            tfm = eval(tfm_dict[\"name\"]).from_dict(tfm_dict)\n            tfms.append(tfm)\n        return cls(tfms)\n\n    def copy(self):\n        new_list = copy.deepcopy(self._tfms)\n        return self.__class__(new_list)\n\n    def __len__(self):\n        return len(self._tfms)\n\n\nclass HalfPrecisionTransformation(BaseTransformation):\n    @staticmethod\n    def _transform_numpy(_input: np.ndarray) -> 
np.ndarray:\n        return _input.astype(dtype=np.float16)\n\n    @staticmethod\n    def _transform_tf(_input: tf.Tensor) -> tf.Tensor:\n        return tf.cast(_input, tf.float16)\n\n    @staticmethod\n    def _transform_torch(_input: torch.Tensor) -> torch.Tensor:\n        return _input.half()\n\n    def _transform(self, _input: Any, **kwargs) -> Any:\n        if isinstance(_input, np.ndarray):\n            return (\n                self._transform_numpy(_input)\n                if _input.dtype == np.float32\n                else _input\n            )\n        elif isinstance(_input, torch.Tensor):\n            return (\n                self._transform_torch(_input)\n                if _input.dtype == torch.float32\n                else _input\n            )\n        elif isinstance(_input, tf.Tensor) and _input is not None:\n            return (\n                self._transform_tf(_input)\n                if _input.dtype == tf.float32\n                else _input\n            )\n        else:\n            raise TypeError(\n                f\"The given input type is not currently supported. \"\n                f\"Got {type(_input)}, expected one between (np.ndarray, \"\n                f\"torch.Tensor, tf.Tensor)\"\n            )\n\n\nclass NoOp(BaseTransformation):\n    def _transform(self, _input: Any, **kwargs):\n        return _input\n\n\nclass VerifyContiguity(BaseTransformation):\n    def _transform(self, _input: Any, **kwargs) -> Any:\n        if not isinstance(_input, torch.Tensor):\n            return _input\n        if not _input.is_contiguous():\n            _input = _input.contiguous()\n        return _input\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/utils.py",
    "content": "import os\nimport subprocess\nimport sys\nimport uuid\nfrom pathlib import Path\nfrom types import ModuleType\nfrom typing import (\n    Tuple,\n    Any,\n    List,\n    Dict,\n    Union,\n    Iterable,\n    Sequence,\n    Optional,\n    Callable,\n)\n\nimport numpy as np\nfrom loguru import logger\nfrom packaging import version\n\nfrom nebullvm.core.models import (\n    DeepLearningFramework,\n    Device,\n    ModelParams,\n    DeviceType,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.onnx import (\n    extract_info_from_np_data,\n    get_output_info_onnx,\n)\nfrom nebullvm.tools.pytorch import (\n    extract_info_from_torch_data,\n    get_output_info_torch,\n)\nfrom nebullvm.tools.tf import (\n    extract_info_from_tf_data,\n    get_output_info_tf,\n)\n\n\ndef get_model_size_mb(model: Any) -> float:\n    if isinstance(model, str):\n        size = os.stat(model).st_size\n    elif isinstance(model, Path):\n        size = os.path.getsize(model.as_posix())\n    elif isinstance(model, torch.nn.Module):\n        size = sum(p.nelement() * p.element_size() for p in model.parameters())\n    else:\n        # we assume it is a tf_model\n        # assuming full precision 32 bit\n        size = model.count_params() * 4\n    return round(size * 1e-6, 2)\n\n\ndef get_model_name(model: Any) -> str:\n    if isinstance(model, str):\n        return model\n    if isinstance(model, Path):\n        return model.as_posix()\n    return model.__class__.__name__\n\n\ndef generate_model_id(model: Any) -> str:\n    model_name = get_model_name(model)\n    return f\"{str(uuid.uuid4())}_{hash(model_name)}\"\n\n\ndef get_throughput(latency: float, batch_size: int = 1) -> float:\n    if latency == 0:\n        return -1\n    return (1 / latency) * batch_size\n\n\ndef ifnone(target, new_value):\n    if target is None:\n        return 
new_value\n    else:\n        return target\n\n\ndef inspect_dynamic_size(\n    tensors: Tuple[Any, ...],\n    sizes: List[Tuple[int, ...]],\n    axis_list: List[Dict],\n):\n    for idx, (tensor, size) in enumerate(zip(tensors, sizes)):\n        for idy, (j, k) in enumerate(zip(tensor.shape, size)):\n            if j != k:\n                if idy == 0:\n                    tag = \"batch_size\"\n                else:\n                    tag = f\"val_{j}_{k}\"\n                axis_list[idx][idy] = tag\n\n\ndef gpu_is_available():\n    try:\n        subprocess.check_output(\"nvidia-smi\")\n        return True\n    except Exception:\n        return False\n\n\ndef neuron_is_available():\n    try:\n        subprocess.check_output(\"neuron-ls\")\n        return True\n    except Exception:\n        return False\n\n\ndef tpu_is_available():\n    # Check if a tpu is available\n    try:\n        import torch_xla\n        import torch_xla.core.xla_model as xm\n\n        return xm.xla_device_hw(torch_xla.core.xla_model.xla_device()) == \"TPU\"\n    except Exception:\n        return False\n\n\ndef check_module_version(\n    module: ModuleType, min_version: str = None, max_version: str = None\n) -> bool:\n    installed_version = module.__version__\n\n    if min_version is not None:\n        if version.parse(installed_version) < version.parse(min_version):\n            return False\n\n    if max_version is not None:\n        if version.parse(installed_version) > version.parse(max_version):\n            return False\n\n    return True\n\n\ndef is_python_version_3_10():\n    return (\n        str(sys.version_info.major) + \".\" + str(sys.version_info.minor)\n        == \"3.10\"\n    )\n\n\ndef get_dl_framework(model: Any):\n    if isinstance(model, torch.nn.Module):\n        return DeepLearningFramework.PYTORCH\n    elif isinstance(model, tf.Module) and model is not None:\n        return DeepLearningFramework.TENSORFLOW\n    elif isinstance(model, str):\n        if 
Path(model).is_file():\n            return DeepLearningFramework.NUMPY\n        else:\n            raise FileNotFoundError(\n                f\"No file '{model}' found, please provide a valid path to \"\n                f\"a model.\"\n            )\n    else:\n        raise TypeError(f\"Model type {type(model)} not supported.\")\n\n\ndef check_input_data(input_data: Union[Iterable, Sequence]):\n    try:\n        assert len(input_data) > 0\n        assert isinstance(input_data[0], tuple)\n        assert isinstance(input_data[0][0], tuple)\n        assert isinstance(\n            input_data[0][0][0], (np.ndarray, torch.Tensor, tf.Tensor)\n        )\n        if len(input_data[0]) > 1:\n            assert isinstance(\n                input_data[0][1],\n                (np.ndarray, torch.Tensor, tf.Tensor, int, float, type(None)),\n            )\n    except:  # noqa E722\n        return False\n    else:\n        return True\n\n\ndef is_data_subscriptable(input_data: Union[Iterable, Sequence]):\n    try:\n        input_data[0]\n    except:  # noqa E722\n        return False\n    else:\n        return True\n\n\ndef check_dynamic_info_inputs(\n    dynamic_info: Optional[Dict], input_sample: Tuple[Any]\n):\n    if dynamic_info is not None:\n        assert dynamic_info.get(\"inputs\") is not None, (\n            \"Dynamic info must contain an 'inputs' key with a list of \"\n            \"dictionaries as value.\"\n        )\n\n        num_dynamic_inputs = len(dynamic_info[\"inputs\"])\n        num_model_inputs = len(input_sample)\n        assert len(dynamic_info[\"inputs\"]) == len(input_sample), (\n            f\"The number of dynamic inputs provided in the dynamic info \"\n            f\"dict ({num_dynamic_inputs}) is not equal to the number \"\n            f\"of inputs of the model ({num_model_inputs}). 
Detected model \"\n            f\"input shapes are: {[input.shape for input in input_sample]} \"\n        )\n\n        assert dynamic_info.get(\"outputs\") is not None, (\n            \"Dynamic info must contain an 'outputs' key with a list of \"\n            \"dictionaries as value.\"\n        )\n\n\ndef extract_info_from_data(\n    model: Any,\n    input_data: DataManager,\n    dl_framework: DeepLearningFramework,\n    dynamic_info: Optional[Dict],\n    device: Device,\n    is_diffusion: bool = False,\n):\n    check_dynamic_info_inputs(dynamic_info, input_data.get_list(1)[0])\n    batch_size, input_sizes, input_types, dynamic_info = INFO_EXTRACTION_DICT[\n        dl_framework\n    ](\n        model,\n        input_data,\n        dynamic_axis=dynamic_info,\n        device=device,\n        is_diffusion=is_diffusion,\n    )\n\n    output_infos = OUTPUT_INFO_COMPUTATION_DICT[dl_framework](\n        model, input_data[0][0], device\n    )\n    model_params = ModelParams(\n        batch_size=batch_size,\n        input_infos=[\n            {\"size\": size, \"dtype\": dtype}\n            for size, dtype in zip(input_sizes, input_types)\n        ],\n        output_sizes=[info[0] for info in output_infos],\n        output_types=[info[1] for info in output_infos],\n        dynamic_info=dynamic_info,\n    )\n    return model_params\n\n\ndef is_huggingface_data(data_sample: Any) -> bool:\n    if is_dict_type(data_sample):\n        return True\n    elif isinstance(data_sample, str):\n        return True\n    elif isinstance(data_sample[0], str):\n        return True\n    return False\n\n\ndef is_dict_type(data_sample: Any):\n    try:\n        data_sample.items()\n    except AttributeError:\n        return False\n    else:\n        return True\n\n\ndef _get_idx(device: str) -> int:\n    device_info = device.split(\":\")\n    if len(device_info) == 2 and device_info[1].isdigit():\n        idx = int(device_info[1])\n    else:\n        idx = 0\n    return idx\n\n\ndef 
_set_device(\n    accelerator_is_available: bool, device_type: DeviceType, idx: int\n) -> Device:\n    if not accelerator_is_available:\n        logger.warning(\n            f\"Selected {device_type.name} device but no available \"\n            f\"{device_type.name} found on this platform. CPU will \"\n            f\"be used instead. Please make sure that the \"\n            f\"{device_type.name} is installed and can be used by your \"\n            \"framework.\"\n        )\n        device = Device(DeviceType.CPU)\n    else:\n        device = Device(device_type, idx=idx)\n\n    return device\n\n\ndef check_device(device: Optional[str] = None) -> Device:\n    if device is None:\n        if gpu_is_available():\n            device = Device(DeviceType.GPU)\n        elif neuron_is_available():\n            device = Device(DeviceType.NEURON)\n        elif tpu_is_available():\n            device = Device(DeviceType.TPU)\n        else:\n            device = Device(DeviceType.CPU)\n    else:\n        if any(x in device.lower() for x in [\"cuda\", \"gpu\"]):\n            device = _set_device(\n                accelerator_is_available=gpu_is_available(),\n                device_type=DeviceType.GPU,\n                idx=_get_idx(device),\n            )\n        elif \"neuron\" in device.lower():\n            device = _set_device(\n                accelerator_is_available=neuron_is_available(),\n                device_type=DeviceType.NEURON,\n                idx=_get_idx(device),\n            )\n        elif \"tpu\" in device.lower():\n            device = _set_device(\n                accelerator_is_available=tpu_is_available(),\n                device_type=DeviceType.TPU,\n                idx=_get_idx(device),\n            )\n        else:\n            device = Device(DeviceType.CPU)\n\n    return device\n\n\ndef get_gpu_compute_capability(gpu_idx: int) -> float:\n    compute_capability = subprocess.check_output(\n        [\"nvidia-smi\", \"--query-gpu=compute_cap\", 
\"--format=csv,noheader\"]\n    ).decode(\"utf-8\")\n    return float(compute_capability.split(\"\\n\")[gpu_idx])\n\n\nINFO_EXTRACTION_DICT: Dict[DeepLearningFramework, Callable] = {\n    DeepLearningFramework.PYTORCH: extract_info_from_torch_data,\n    DeepLearningFramework.TENSORFLOW: extract_info_from_tf_data,\n    DeepLearningFramework.NUMPY: extract_info_from_np_data,\n}\n\nOUTPUT_INFO_COMPUTATION_DICT: Dict[DeepLearningFramework, Callable] = {\n    DeepLearningFramework.PYTORCH: get_output_info_torch,\n    DeepLearningFramework.TENSORFLOW: get_output_info_tf,\n    DeepLearningFramework.NUMPY: get_output_info_onnx,\n}\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm/tools/venv.py",
    "content": "import subprocess\nimport tempfile\nimport venv\n\nfrom loguru import logger\n\n\nclass EnvBuilder(venv.EnvBuilder):\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.context = None\n\n    def post_setup(self, context):\n        self.context = context\n\n\ndef run_in_different_venv(\n    requirements_file: str,\n    script_path: str,\n    use_gpu: bool,\n    *args,\n):\n    \"\"\"Run a python scripts in a new temporary environment. Arguments for the\n    script must be passed in the function args.\n    it is equivalent to create and activate a new environment and running\n    > pip install -r $requirement_file\n    > python -m script_path *args\n    Args:\n        requirements_file (str): File (.txt) containing the list of\n            requirements.\n        script_path (str): Path to the script that must be run.\n        args: Arguments of the script.\n    \"\"\"\n    logger.debug(f\"Debug: Running script {script_path} in a new virtual env.\")\n    with tempfile.TemporaryDirectory() as target_dir_path:\n        logger.debug(\"Debug: Creating virtual environment...\")\n        venv_builder = EnvBuilder(with_pip=True)\n        venv_builder.create(str(target_dir_path))\n        venv_context = venv_builder.context\n\n        logger.debug(\"Debug: Installing requirements...\")\n\n        if use_gpu:\n            pip_install_command = [\n                venv_context.env_exe,\n                \"-m\",\n                \"pip\",\n                \"install\",\n                \"torch==1.9.1+cu111\",\n                \"torchvision==0.10.1+cu111\",\n                \"-f\",\n                \"https://download.pytorch.org/whl/torch_stable.html\",\n            ]\n        else:\n            pip_install_command = [\n                venv_context.env_exe,\n                \"-m\",\n                \"pip\",\n                \"install\",\n                \"torch<=1.9.1\",\n                \"torchvision<=0.10.1\",\n         
   ]\n        subprocess.check_call(pip_install_command)\n\n        pip_install_command = [\n            venv_context.env_exe,\n            \"-m\",\n            \"pip\",\n            \"install\",\n            \"-r\",\n            requirements_file,\n        ]\n        subprocess.check_call(pip_install_command)\n\n        logger.debug(\"Debug: Executing script...\")\n        script_command = [venv_context.env_exe, script_path, *args]\n        subprocess.check_call(script_command)\n"
  },
  {
    "path": "optimization/nebullvm/nebullvm.toml",
    "content": "[build-system]\nrequires = [\n    \"setuptools>=42\",\n    \"wheel\"\n]\nbuild-backend = \"setuptools.build_meta\""
  },
  {
    "path": "optimization/nebullvm/requirements-dev.txt",
    "content": "pytest\npytest-mock\ntorchvision\nsentencepiece\n"
  },
  {
    "path": "optimization/nebullvm/requirements.txt",
    "content": "numpy>=1.21.0, <1.24.0\npackaging>=21.3\npy-cpuinfo==8.0.0\nPyYAML>=6.0\npsutil>=5.0.0\nrequests>=2.26.1\ntqdm>=4.36.0\nloguru>=0.5.3"
  },
  {
    "path": "optimization/nebullvm/setup.py",
    "content": "from pathlib import Path\nfrom setuptools import setup, find_packages\n\n\nREQUIREMENTS = [\n    \"numpy>=1.21.0, <1.24.0\",\n    \"py-cpuinfo>=8.0.0\",\n    \"PyYAML>=6.0\",\n    \"psutil>=5.0.0\",\n    \"requests>=2.26.0\",\n    \"tqdm>=4.36.0\",\n    \"packaging>=21.3\",\n    \"loguru>=0.5.3\",\n]\n\nthis_directory = Path(__file__).parent\nlong_description = (this_directory / \"README.md\").read_text(encoding=\"utf8\")\n\nsetup(\n    name=\"nebullvm\",\n    version=\"0.10.0\",\n    packages=find_packages(),\n    install_requires=REQUIREMENTS,\n    long_description=long_description,\n    include_package_data=True,\n    long_description_content_type=\"text/markdown\",\n)\n"
  },
  {
    "path": "optimization/open_alpha_tensor/README.md",
    "content": "# 🐉 OpenAlphaTensor\nOpenAlphaTensor provides an open-source implementation of Deepmind's AlphaTensor algorithm.\n\nWith OpenAlphaTensor, you can increase the computational performances of an AI model with custom-generated matrix multiplication algorithms. You can train your own AlphaTensor algorithm for a specific matrix size or fine-tune a pre-trained AlphaTensor model to produce optimized kernels for a specific hardware.\n\nOpenAlphaTensor is based on Deepmind's paper [Discovering Faster Matrix Multiplication Algorithms with Reinforcement Learning](https://www.nature.com/articles/s41586-022-05172-4).\n\nIf you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers)\n\n## 🧑‍🏫 Installation\nYou can install the package cloning the repository and running the following commands:\n```bash\ngit clone https://github.com/nebuly-ai/nebullvm.git\ncd nebullvm/apps/accelerate/open_alpha_tensor\npip install -e .\n```\n\n## 🚀 Get started\nFor training your AlphaTensor model, you can execute the following command:\n```bash\npython main.py \n```\nModel parameters can be given either as command line arguments or as a JSON file. 
The `config.json` file contains the default parameters for training a model for matrix size 4x4x4.\n\nAlternatively, if you want more fine-grained control over the training process, you can use the Python API:\n```python\nfrom open_alpha_tensor import train_alpha_tensor\n\ncardinality_vector = 5  # The actions can have values in range [-2, 2]\nN_bar = 100  # parameter for smoothing the temperature while adjusting the probability distribution\nmatrix_size = 5\ninput_size = matrix_size**2\nn_steps = 15\nn_actions = cardinality_vector ** (3 * input_size // n_steps)\naction_memory = 7\n\ntrain_alpha_tensor(\n    tensor_length=action_memory + 1,\n    input_size=input_size,\n    scalars_size=1,\n    emb_dim=2048,\n    n_steps=n_steps,\n    n_logits=n_actions,\n    n_samples=32,\n    device=\"cuda\",\n    len_data=2048,\n    n_synth_data=1000000,\n    pct_synth=0.7,\n    batch_size=32,\n    epochs=600000,\n    lr=1e-4,\n    lr_decay_factor=0.5,\n    lr_decay_steps=5000,\n    weight_decay=1e-5,\n    optimizer_name=\"adamw\",\n    loss_params=(1, 1),\n    limit_rank=150,\n    checkpoint_dir=\"path/to/checkpoint/dir\",\n    checkpoint_data_dir=\"path/where/to/save/data/generated/by/the/model\",\n    n_actors=1,\n    mc_n_sim=200,\n    n_cob=100000,\n    cob_prob=0.9983,\n    data_augmentation=True,\n    N_bar=N_bar,\n    random_seed=42,\n    extra_devices=None,\n    save_dir=\"path/to/save/final/model\",\n)\n```\n\n## 🧪 Missing features\n- [ ] Release weights of pre-trained models. **Coming out soon**.\n- [ ] Add compilation of AlphaTensor kernels in OpenAI's Triton and JAX/XLA.\n- [ ] Add support for fine-tuning on target hardware.\n- [ ] Support training on multiple GPUs (this allows training with a larger batch size).\n- [ ] Add support for other compilers (e.g. 
LLVM).\n- [ ] Reduce memory footprint of the Acting Agent.\n- [ ] Improve acting speed.\n\n## 💫 Contributing\n\nWe welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [contribution guidelines](https://docs.nebuly.com/contributions) for more information on how to get involved.\n\nA special thanks to [BrianPulfer](https://github.com/BrianPulfer) for his awesome contribution to the OpenAlphaTensor module.\n"
  },
  {
    "path": "optimization/open_alpha_tensor/config.json",
    "content": "{\n    \"batch_size\": 16,\n    \"max_epochs\": 600000,\n    \"action_memory\": 7,\n    \"optimizer\": \"adamw\",\n    \"weight_decay\": 1e-5,\n    \"lr\": 1e-4,\n    \"lr_decay_factor\": 0.1,\n    \"lr_decay_steps\": 500000,\n    \"device\": \"cuda:0\",\n    \"len_data\": 2048,\n    \"pct_synth\": 0.9,\n    \"n_synth_data\": 100000,\n    \"limit_rank\": 125,\n    \"alpha\": 1.0,\n    \"beta\": 1.0,\n    \"matrix_size\": 4,\n    \"embed_dim\": 1024,\n    \"actions_sampled\": 32,\n    \"n_actors\": 1,\n    \"mc_n_sim\": 200,\n    \"n_cob\": 100000,\n    \"cob_prob\": 0.9983,\n    \"cardinality_vector\": 5,\n    \"n_bar\": 100\n}"
  },
  {
    "path": "optimization/open_alpha_tensor/main.py",
    "content": "import json\nimport os\nfrom argparse import ArgumentParser\nfrom pathlib import Path\n\nfrom open_alpha_tensor import train_alpha_tensor\n\n\ndef _compute_largest_divisor(n: int) -> int:\n    \"\"\"Compute the largest divisor of n.\"\"\"\n    for i in range(n // 2, 0, -1):\n        if n % i == 0:\n            return i\n    return 1\n\n\ndef main():\n    config_file = Path(os.getenv(\"CONFIG_FILE\", \"config.json\"))\n    if config_file.exists():\n        with open(config_file) as f:\n            config = json.load(f)\n    else:\n        config = {}\n    parser = ArgumentParser()\n    parser.add_argument(\"--batch_size\", type=int, default=1)\n    parser.add_argument(\"--max_epochs\", type=int, default=1)\n    parser.add_argument(\"--action_memory\", type=int, default=1)\n    parser.add_argument(\"--optimizer\", type=str, default=\"adamw\")\n    parser.add_argument(\"--weight_decay\", type=float, default=1e-5)\n    parser.add_argument(\"--lr\", type=float, default=1e-4)\n    parser.add_argument(\"--lr_decay_factor\", type=float, default=0.5)\n    parser.add_argument(\"--lr_decay_steps\", type=int, default=5000)\n    parser.add_argument(\"--device\", type=str, default=\"cuda\")\n    # parser.add_argument(\"--half\", action=\"store_true\")\n    parser.add_argument(\"--len_data\", type=int, default=100)\n    parser.add_argument(\"--pct_synth\", type=float, default=0.5)\n    parser.add_argument(\"--n_synth_data\", type=int, default=100)\n    parser.add_argument(\"--limit_rank\", type=int, default=15)\n    parser.add_argument(\"--alpha\", type=float, default=1.0)\n    parser.add_argument(\"--beta\", type=float, default=1.0)\n    parser.add_argument(\"--random_seed\", type=int, default=None)\n    parser.add_argument(\"--checkpoint_dir\", type=str, default=None)\n    parser.add_argument(\"--checkpoint_data_dir\", type=str, default=None)\n    parser.add_argument(\"--matrix_size\", type=int, default=3)\n    parser.add_argument(\"--embed_dim\", type=int, 
default=1024)\n    parser.add_argument(\"--actions_sampled\", type=int, default=10)\n    parser.add_argument(\"--n_actors\", type=int, default=1)\n    parser.add_argument(\"--mc_n_sim\", type=int, default=100)\n    parser.add_argument(\"--n_cob\", type=int, default=100000)\n    parser.add_argument(\"--cob_prob\", type=float, default=0.9983)  # 1 - 0.0017\n    parser.add_argument(\"--data_augmentation\", action=\"store_true\")\n    parser.add_argument(\"--cardinality_vector\", type=int, default=5)\n    parser.add_argument(\n        \"--n_bar\",\n        type=int,\n        default=100,\n        help=\"N_bar parameter for policy temperature.\",\n    )\n    parser.add_argument(\"--save_dir\", type=str, default=None)\n    parser.add_argument(\"extra_devices\", nargs=\"*\", type=str, default=[])\n    parser.set_defaults(**config)\n    args = parser.parse_args()\n\n    cardinality_vector = args.cardinality_vector\n    N_bar = args.n_bar\n    input_size = args.matrix_size**2\n    n_steps = _compute_largest_divisor(input_size)\n    n_actions = cardinality_vector ** (3 * input_size // n_steps)\n    loss_params = (args.alpha, args.beta)\n\n    train_alpha_tensor(\n        tensor_length=args.action_memory + 1,\n        input_size=input_size,\n        scalars_size=1,\n        emb_dim=args.embed_dim,\n        n_steps=n_steps,\n        n_logits=n_actions,\n        n_samples=args.actions_sampled,\n        device=args.device,\n        len_data=args.len_data,\n        n_synth_data=args.n_synth_data,\n        pct_synth=args.pct_synth,\n        batch_size=args.batch_size,\n        epochs=args.max_epochs,\n        lr=args.lr,\n        lr_decay_factor=args.lr_decay_factor,\n        lr_decay_steps=args.lr_decay_steps,\n        weight_decay=args.weight_decay,\n        optimizer_name=args.optimizer,\n        loss_params=loss_params,\n        limit_rank=args.limit_rank,\n        random_seed=args.random_seed,\n        checkpoint_dir=args.checkpoint_dir,\n        
checkpoint_data_dir=args.checkpoint_data_dir,\n        n_actors=args.n_actors,\n        mc_n_sim=args.mc_n_sim,\n        n_cob=args.n_cob,\n        cob_prob=args.cob_prob,\n        data_augmentation=args.data_augmentation or False,\n        N_bar=N_bar,\n        extra_devices=args.extra_devices,\n        save_dir=args.save_dir,\n    )\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/__init__.py",
    "content": "from open_alpha_tensor.api.functions import train_alpha_tensor  # noqa: F401\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/api/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/api/functions.py",
    "content": "from typing import List, Tuple\n\nfrom open_alpha_tensor.root_op import TrainAlphaTensorRootOp\n\n\ndef train_alpha_tensor(\n    tensor_length: int,\n    input_size: int,\n    scalars_size: int,\n    emb_dim: int,\n    n_steps: int,\n    n_logits: int,\n    n_samples: int,\n    optimizer_name: str,\n    lr: float,\n    lr_decay_factor: float,\n    lr_decay_steps: int,\n    weight_decay: float,\n    loss_params: Tuple[float, float],\n    checkpoint_dir: str,\n    checkpoint_data_dir: str,\n    epochs: int,\n    batch_size: int,\n    len_data: int,\n    n_synth_data: int,\n    pct_synth: float,\n    limit_rank: int,\n    n_actors: int,\n    mc_n_sim: int,\n    N_bar: int,\n    device: str,\n    save_dir: str,\n    random_seed: int,\n    n_cob: int,\n    cob_prob: float,\n    data_augmentation: bool,\n    extra_devices: List[str],\n):\n    \"\"\"Trains an AlphaTensor model to learn more efficient matrix\n    multiplications and returns it.\n\n    Args:\n        tensor_length (int): Number of tensors to as history.\n        input_size (int): Flattened size of the matrices to be multiplied.\n        scalars_size (int): Size of the scalar vectors fed to the torso model.\n        emb_dim (int): Embedding dimension.\n        n_steps (int): Number of steps used to get a single action out of a\n        triplet.\n        n_logits (int): Number of logits output by the policy head.\n        n_samples (int): Number of samples used by the policy head at\n        evaluation time.\n        optimizer_name (str): Name of the optimizer used.\n        lr (float): Learning rate.\n        lr_decay_factor (float): Learning rate's decay factor.\n        lr_decay_steps (int): Number of learning rate's decay steps.\n        weight_decay (float): Weight decay used by the optimizer.\n        loss_params (Tuple[float, float]): Alpha and Beta parameters used in\n        the loss function.\n        checkpoint_dir (str): Directory used to store model checkpoints.\n        
checkpoint_data_dir (str): Directory used to store games as JSON files.\n        epochs (int): Number of training epochs.\n        batch_size (int): Batch size.\n        len_data (int): Number of training samples used (both actor generated\n        and synthetic).\n        n_synth_data (int): Number of synthetic training samples.\n        pct_synth (float): Initial percentage of synthetic samples used for\n        training.\n        limit_rank (int): Maximum number of steps per episode and maximum rank\n        for synthetically-generated matrices.\n        n_actors (int): Number of actors to play a single each game at each\n        training step.\n        mc_n_sim (int): Number of simulations during Monte Carlo tree search.\n        N_bar (int): N_bar parameter used to compute tau when improving the\n        policy.\n        device (str): The name of the torch device used for training.\n        save_dir (str): Directory where the final trained model will be stored.\n        random_seed (int): Randomizing seed.\n        n_cob (int): Number of change of basis (cob) used for a single\n        training sample.\n        cob_prob (float): Probability of applying a change of basis.\n        data_augmentation (bool): Whether to randomly swap the last operation\n        of an episode with another operation.\n        extra_devices (List[str]): Extra devices names used for multi-GPU\n        training.\n    \"\"\"\n    root_op = TrainAlphaTensorRootOp()\n    root_op.execute(\n        tensor_length=tensor_length,\n        input_size=input_size,\n        scalars_size=scalars_size,\n        emb_dim=emb_dim,\n        n_steps=n_steps,\n        n_logits=n_logits,\n        n_samples=n_samples,\n        optimizer_name=optimizer_name,\n        lr=lr,\n        lr_decay_factor=lr_decay_factor,\n        lr_decay_steps=lr_decay_steps,\n        weight_decay=weight_decay,\n        loss_params=loss_params,\n        checkpoint_dir=checkpoint_dir,\n        
checkpoint_data_dir=checkpoint_data_dir,\n        epochs=epochs,\n        batch_size=batch_size,\n        len_data=len_data,\n        n_synth_data=n_synth_data,\n        pct_synth=pct_synth,\n        limit_rank=limit_rank,\n        n_actors=n_actors,\n        mc_n_sim=mc_n_sim,\n        N_bar=N_bar,\n        device=device,\n        save_dir=save_dir,\n        random_seed=random_seed,\n        n_cob=n_cob,\n        cob_prob=cob_prob,\n        data_augmentation=data_augmentation,\n        extra_devices=extra_devices,\n    )\n    return root_op.get_result()\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/config.py",
    "content": "BASE_CHECKPOINT_DIR = \"checkpoints\"\nBASE_CHECKPOINT_DATA_DIR = \"games\"\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/actors/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/actors/stage.py",
    "content": "from typing import Dict, List\n\nimport torch\n\nfrom open_alpha_tensor.core.data.utils import (\n    get_scalars,\n    map_action_to_triplet,\n)\nfrom open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel\n\n\ndef game_is_finished(state):\n    \"\"\"Tells if the game is finished or not.\n\n    Args:\n        state (torch.Tensor): The state of the game.\n    \"\"\"\n    # state size (1, S, S, S)\n    return (state == 0).all()\n\n\ndef remove_duplicates(reducing_tensor: torch.Tensor):\n    \"\"\"Remove duplicates from a tensor.\n\n    Args:\n        reducing_tensor (torch.Tensor): The tensor to remove duplicates from.\n    \"\"\"\n    # reducing tensor has shape (1, N_mc, S, S, S)\n    n_mc = reducing_tensor.shape[1]\n    indexes = []\n    idx_map = {}\n    for idx in range(n_mc):\n        if len(indexes) == 0:\n            indexes.append(idx)\n            idx_map[idx] = []\n        else:\n            idx_tensor = reducing_tensor[:, idx]\n            for index in indexes:\n                if (reducing_tensor[:, index] - idx_tensor == 0).all():\n                    idx_map[index].append(idx)\n                    break\n            else:\n                indexes.append(idx)\n                idx_map[idx] = []\n\n    # idx_map = {i: len(v) for i, v in enumerate(idx_map.values())}\n    old_idx_to_new_idx_map = {}\n    for new_idx, (key, values) in enumerate(idx_map.items()):\n        old_idx_to_new_idx_map[key] = new_idx\n        for second_idx in values:\n            old_idx_to_new_idx_map[second_idx] = new_idx\n    return (\n        reducing_tensor[:, indexes],\n        old_idx_to_new_idx_map,\n        idx_map,\n        indexes,\n    )\n\n\ndef extract_children_states_from_actions(\n    state: torch.Tensor,\n    actions: torch.Tensor,\n    vec_cardinality: int = 5,\n):\n    \"\"\"Extract the children states from the actions.\n\n    Args:\n        state (torch.Tensor): The state of the game.\n        actions (torch.Tensor): The actions to 
apply to the state.\n        vec_cardinality (int, optional): The cardinality of the vectors.\n    \"\"\"\n    # state (1, T, S, S, S)\n    # actions (1, K, N_steps)\n    # we assume actions to be with N_steps = 1,\n    #  and N_logits = |F|^(3S/N_steps). Each action is then mapped in a\n    #  unique way to a triplet (u, v, w) where each vector has size S.\n    # vector cardinality represents the number of values it can take an entry\n    #  of u, v or w.\n    bs, k, n_steps = actions.shape[:3]\n    len_token = 3 * state.shape[2] // n_steps\n    actions = map_action_to_triplet(actions, vec_cardinality, len_token)\n    actions = actions.reshape(bs, k, n_steps * len_token)\n    vec_dim = state.shape[2]\n    u = actions[:, :, :vec_dim].reshape(bs, k, vec_dim, 1, 1)\n    v = actions[:, :, vec_dim : 2 * vec_dim].reshape(  # noqa E203\n        bs, k, 1, vec_dim, 1\n    )\n    w = actions[:, :, 2 * vec_dim :].reshape(bs, k, 1, 1, vec_dim)  # noqa E203\n    reducing_tensor = u * v * w\n    (\n        reducing_tensor,\n        old_idx_to_new_idx,\n        repetition_map,\n        not_duplicate_indexes,\n    ) = remove_duplicates(reducing_tensor)\n    old_state = state[:, 0]\n    new_state = old_state.unsqueeze(1) - reducing_tensor\n    rolling_states = torch.roll(state, 1)[:, 2:]\n    return (\n        [\n            torch.cat(\n                [\n                    new_state[:, i : i + 1],  # noqa E203\n                    reducing_tensor[:, i : i + 1],  # noqa E203\n                    rolling_states,\n                ],\n                dim=1,\n            )\n            for i in range(k)\n        ],\n        old_idx_to_new_idx,\n        repetition_map,\n        not_duplicate_indexes,\n    )\n\n\ndef _reduce_memory_consumption_before_storing(\n    possible_states: List[torch.Tensor],\n):\n    \"\"\"Reduce the memory consumption before storing the states.\n\n    Args:\n        possible_states (List[torch.Tensor]): The possible states.\n    \"\"\"\n    final_states = 
[state[:, 0:2] for state in possible_states]\n    previous_actions = possible_states[0][:, 2:]\n    storing_dict = {\n        \"final_states\": final_states,\n        \"previous_actions\": previous_actions,\n    }\n    return storing_dict\n\n\ndef _recompose_possible_states(reduced_memory_states_dict: Dict):\n    \"\"\"Recompose the possible states from the reduced memory states.\n\n    Args:\n        reduced_memory_states_dict (Dict): The reduced memory states.\n    \"\"\"\n    final_states = reduced_memory_states_dict[\"final_states\"]\n    previous_actions = reduced_memory_states_dict[\"previous_actions\"]\n    possible_states = [\n        torch.cat(\n            [\n                final_states[i],\n                previous_actions,\n            ],\n            dim=1,\n        )\n        for i in range(len(final_states))\n    ]\n    return possible_states\n\n\ndef extract_present_state(state: torch.Tensor) -> torch.Tensor:\n    return state[:, 0]\n\n\ndef to_hash(tensor: torch.Tensor) -> str:\n    \"\"\"Converts a tensor to a hash string.\n\n    Args:\n        tensor: The tensor to convert.\n    \"\"\"\n    hashable_tensor = \"_\".join(\n        tensor.reshape(-1).long().detach().cpu().numpy().astype(str).tolist()\n    )\n    return hashable_tensor\n\n\ndef from_hash(hashable_tensor: str, shape: tuple) -> torch.Tensor:\n    \"\"\"Converts a hash string back to the original tensor.\n\n    Args:\n        hashable_tensor (str): The hash string.\n        shape (tuple): The shape of the original tensor.\n    \"\"\"\n    return torch.tensor([float(x) for x in hashable_tensor.split(\"_\")]).resize(\n        shape\n    )\n\n\ndef record_action(tree_dict: Dict, state: str, action: str):\n    \"\"\"Record the action in the tree dictionary.\n\n    Args:\n        tree_dict (Dict): The tree dictionary.\n        state (str): The state as a hash string.\n        action (str): The action as a hash string.\n    \"\"\"\n    if state in tree_dict:\n        
tree_dict[state].append(action)\n    else:\n        tree_dict[state] = [action]\n\n\ndef select_future_state(\n    possible_states: List[torch.Tensor],\n    q_values: torch.Tensor,\n    N_s_a: torch.Tensor,\n    repetitions: Dict[int, list],\n    c_1: float = 1.25,\n    c_2: float = 19652,\n    return_idx: bool = False,\n) -> torch.Tensor:\n    \"\"\"Select the future state maximizing the upper confidence bound.\"\"\"\n    # q_values (1, K, 1)\n    pi = torch.tensor(\n        [\n            len(repetitions[i])\n            for i in range(len(possible_states))\n            if i in repetitions\n        ]\n    ).to(q_values.device)\n    if pi.shape[0] != N_s_a.shape[1]:\n        print(pi)\n        print(pi.shape, q_values.shape, N_s_a.shape)\n        pi = pi[: N_s_a.shape[1]]\n    ucb = q_values.reshape(-1) + pi * torch.sqrt(\n        torch.sum(N_s_a) / (1 + N_s_a)\n    ) * (c_1 + torch.log((torch.sum(N_s_a) + c_2 + 1) / c_2))\n    if return_idx:\n        return ucb.argmax()\n    return possible_states[ucb.argmax()]\n\n\n@torch.no_grad()\ndef simulate_game(\n    model,\n    state: torch.Tensor,\n    t_time: int,\n    max_steps: int,\n    game_tree: Dict,\n    states_dict: Dict,\n    horizon: int = 5,\n):\n    \"\"\"Simulates a game from a given state.\n\n    Args:\n        model: The model to use for the simulation.\n        state (torch.Tensor): The initial state.\n        t_time (int): The current time step.\n        max_steps (int): The maximum number of steps to simulate.\n        game_tree (Dict): The game tree.\n        states_dict (Dict): The states dictionary.\n        horizon (int): The horizon to use for the simulation.\n    \"\"\"\n    idx = t_time\n    max_steps = min(max_steps, t_time + horizon)\n    state_hash = to_hash(extract_present_state(state))\n    trajectory = []\n    # selection\n    while state_hash in game_tree:\n        (\n            possible_states_dict,\n            old_idx_to_new_idx,\n            repetition_map,\n            N_s_a,\n      
      q_values,\n            actions,\n        ) = states_dict[state_hash]\n        possible_states = _recompose_possible_states(possible_states_dict)\n        state_idx = select_future_state(\n            possible_states, q_values, N_s_a, repetition_map, return_idx=True\n        )\n        trajectory.append((state_hash, state_idx))  # state_hash, action_idx\n        future_state = extract_present_state(possible_states[state_idx])\n        state = possible_states[state_idx]\n        state_hash = to_hash(future_state)\n        idx += 1\n\n    # expansion\n    if idx <= max_steps:\n        trajectory.append((state_hash, None))\n        if not game_is_finished(extract_present_state(state)):\n            state = state.to(model.device)\n            scalars = get_scalars(state, idx).to(state.device)\n            actions, probs, q_values = model(state, scalars)\n            (\n                possible_states,\n                cloned_idx_to_idx,\n                repetitions,\n                not_dupl_indexes,\n            ) = extract_children_states_from_actions(\n                state,\n                actions,\n            )\n            not_dupl_actions = actions[:, not_dupl_indexes].to(\"cpu\")\n            not_dupl_q_values = torch.zeros(not_dupl_actions.shape[:-1]).to(\n                \"cpu\"\n            )\n            N_s_a = torch.zeros_like(not_dupl_q_values).to(\"cpu\")\n            present_state = extract_present_state(state)\n            states_dict[to_hash(present_state)] = (\n                _reduce_memory_consumption_before_storing(possible_states),\n                cloned_idx_to_idx,\n                repetitions,\n                N_s_a,\n                not_dupl_q_values,\n                not_dupl_actions,\n            )\n            game_tree[to_hash(present_state)] = [\n                to_hash(extract_present_state(fut_state))\n                for fut_state in possible_states\n            ]\n            leaf_q_value = q_values\n    else:\n        
leaf_q_value = -int(torch.linalg.matrix_rank(state).sum())\n    # backup\n    backward_pass(trajectory, states_dict, leaf_q_value=leaf_q_value)\n\n\ndef backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor):\n    \"\"\"Backward pass of the montecarlo algorithm\"\"\"\n    reward = 0\n    for idx, (state, action_idx) in enumerate(reversed(trajectory)):\n        if action_idx is None:  # leaf node\n            reward += leaf_q_value\n        else:\n            (\n                _,\n                old_idx_to_new_idx,\n                _,\n                N_s_a,\n                q_values,\n                _,\n            ) = states_dict[state]\n            if isinstance(reward, torch.Tensor):\n                reward = reward.to(q_values.device)\n            action_idx = int(action_idx)\n            if action_idx in old_idx_to_new_idx:\n                not_dupl_index = old_idx_to_new_idx[int(action_idx)]\n            else:\n                not_dupl_index = action_idx\n            reward -= 1\n            q_values[:, not_dupl_index] = (\n                N_s_a[:, not_dupl_index] * q_values[:, not_dupl_index] + reward\n            ) / (N_s_a[:, not_dupl_index] + 1)\n            N_s_a[:, not_dupl_index] += 1\n\n\ndef monte_carlo_tree_search(\n    model: torch.nn.Module,\n    state: torch.Tensor,\n    n_sim: int,\n    t_time,\n    n_steps: int,\n    game_tree: Dict,\n    state_dict: Dict,\n):\n    \"\"\"Runs the monte carlo tree search algorithm.\n\n    Args:\n        model (torch.nn.Module): The model to use for the simulation.\n        state (torch.Tensor): The initial state.\n        n_sim (int): The number of simulations to run.\n        t_time (int): The current time step.\n        n_steps (int): The maximum number of steps to simulate.\n        game_tree (Dict): The game tree.\n        state_dict (Dict): The dictionary containing the states.\n    \"\"\"\n    # Note that game tree is not the full tree, but just the one having as root\n    #  the current 
node(state).\n    # should we accept also previous updated trajectories for the current node?\n    # is it something we should considering when deciding how many simulations\n    # we should run? (I think yes)\n    state_hash = to_hash(extract_present_state(state))\n    if state_hash in state_dict:\n        with torch.no_grad():\n            N_s_a = state_dict[state_hash][3]\n            n_sim -= int(N_s_a.sum())\n            n_sim = max(n_sim, 0)\n\n    for _ in range(n_sim):\n        simulate_game(model, state, t_time, n_steps, game_tree, state_dict)\n    # return next state\n    possible_states_dict, _, repetitions, N_s_a, q_values, _ = state_dict[\n        state_hash\n    ]\n    possible_states = _recompose_possible_states(possible_states_dict)\n    next_state_idx = select_future_state(\n        possible_states, q_values, N_s_a, repetitions, return_idx=True\n    )\n    next_state = possible_states[next_state_idx]\n    return next_state\n\n\n@torch.no_grad()\ndef compute_improved_policy(\n    state_dict: Dict,\n    states: List[str],\n    model_n_steps: int,\n    model_n_logits: int,\n    N_bar: int,\n):\n    \"\"\"Compute the improved policy given the state_dict, the list of states.\n    The improved policy is computed as (N_s_aˆ(1/tau) / (N_s_aˆ(1/tau)).sum())\n    where tau is (log(N_s_a.sum()) / log(N_bar))\n    \"\"\"\n    policies = torch.zeros(len(states), model_n_steps, model_n_logits)\n    N_bar = torch.tensor(N_bar)\n    for idx, state in enumerate(states):\n        N_s_a = state_dict[state][3]\n        actions = state_dict[state][5]\n        if N_s_a.sum() > N_bar:\n            tau = (torch.log(N_s_a.sum()) / torch.log(N_bar)).item()\n        else:\n            tau = 1\n        N_s_a = N_s_a ** (1 / tau)\n        improved_policy = N_s_a / N_s_a.sum()\n        for sample_id in range(actions.shape[1]):\n            action_ids = actions[0, sample_id]\n            for step_id, action_id in enumerate(action_ids):\n                policies[idx, step_id, 
action_id] += improved_policy[\n                    0, sample_id\n                ]\n    return policies\n\n\ndef actor_prediction(\n    model: AlphaTensorModel,\n    input_tensor: torch.Tensor,\n    maximum_rank: int,\n    mc_n_sim: int,\n    N_bar: int,\n    return_actions: bool = False,\n):\n    \"\"\"Runs the monte carlo tree search algorithm to obtain the next states,\n    policies and rewards.\n\n    Args:\n        model (AlphaTensorModel): The model to use for the simulation.\n        input_tensor (torch.Tensor): The initial state.\n        maximum_rank (int): The maximum number of steps to simulate.\n        mc_n_sim (int): The number of simulations to run.\n        N_bar (int): The parameter used to compute the improved policy.\n        return_actions (bool): If True, only actions are returned.\n    \"\"\"\n    # input_tensor has shape (1, T, S, S, S)\n    state = input_tensor\n    rank = 0\n    game_tree = {}\n    state_dict = {}\n    hash_states = []\n    states = []\n    while rank < maximum_rank:\n        states.append(state)\n        hash_states.append(to_hash(extract_present_state(state)))\n        state = monte_carlo_tree_search(\n            model,\n            state,\n            mc_n_sim,\n            rank,\n            maximum_rank,\n            game_tree,\n            state_dict,\n        )\n        if game_is_finished(extract_present_state(state)):\n            break\n        rank += 1\n    final_state = extract_present_state(state)\n    policies = compute_improved_policy(\n        state_dict, hash_states, model.n_steps, model.n_logits, N_bar\n    )\n    reward = (\n        int(torch.linalg.matrix_rank(final_state).sum())\n        if not game_is_finished(final_state)\n        else 0\n    )\n    rewards = torch.cumsum(\n        torch.tensor([-1] * (len(policies) - 1) + [reward]), dim=0\n    )\n    if return_actions:\n        actions = [state_dict[hash_state][5] for hash_state in hash_states]\n        return actions\n    # policies do not have 
the batch size, but states still have it\n    states = [s.squeeze(0) for s in states]\n    return states, policies, rewards\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/data/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/data/basis_change.py",
    "content": "from pathlib import Path\nfrom typing import Callable\n\nimport numpy as np\nimport torch\n\n\ndef get_change_basis_matrix(\n    tensor_size: int,\n    n_cob: int,\n    entry_distribution: Callable = torch.randn,\n    random_seed: int = None,\n):\n    \"\"\"Generate a list of change of basis matrices.\n\n    Args:\n        tensor_size (int): Size of the tensor.\n        n_cob (int): Number of change of basis matrices.\n        entry_distribution (Callable, optional): Distribution of the entries\n        of the change of basis matrices.\n        random_seed (int, optional): Random seed for reproducibility.\n    \"\"\"\n    if random_seed is not None:\n        torch.random.manual_seed(random_seed)\n    for _ in range(n_cob):\n        diag_p = 2 * (torch.rand(tensor_size) > 0.5).float() - 1\n        diag_l = 2 * (torch.rand(tensor_size) > 0.5).float() - 1\n        random_matrix = entry_distribution((tensor_size, tensor_size))\n        p_matrix = torch.diag(diag_p)\n        l_matrix = torch.diag(diag_l)\n        p_matrix = p_matrix + torch.triu(random_matrix, diagonal=1)\n        l_matrix = l_matrix + torch.tril(random_matrix, diagonal=-1)\n        yield torch.matmul(p_matrix, l_matrix)\n\n\ndef cob_entry_prob_distribution(size):\n    full_size = int(np.prod(size))\n    vals = torch.tensor([-1, 0, 1])\n    probs = torch.tensor([0.0075, 0.985, 0.0075]).unsqueeze(0)\n    cum_sum = torch.cumsum(probs, dim=-1)\n    unif_prob = torch.rand((full_size, 1))\n    tensor_idx = torch.argmax((unif_prob <= cum_sum).int(), dim=1)\n    tensor = vals[tensor_idx]\n    return tensor.reshape(size)\n\n\nclass ChangeOfBasis:\n    \"\"\"Change of Basis class.\"\"\"\n\n    \"\"\"Change of Basis class.\"\"\"\n\n    def __init__(\n        self,\n        tensor_size: int,\n        n_cob: int,\n        cob_prob: float,\n        device: str,\n        random_seed: int = None,\n    ):\n        \"\"\"Builds a ChangeOfBasis object.\n\n        Args:\n            tensor_size (int): Size 
of the tensor.\n            n_cob (int): Number of change of basis matrices.\n            cob_prob (float): Probability of applying a change of basis.\n            device (str): Name of the torch device to use.\n            random_seed (int, optional): Random seed for reproducibility.\n        \"\"\"\n        self.tmp_dir = Path.home() / \".data_alpha_tensor/cob_matrices\"\n        self.tmp_dir.mkdir(exist_ok=True, parents=True)\n        for i, cob_matrix in enumerate(\n            get_change_basis_matrix(\n                tensor_size, n_cob, cob_entry_prob_distribution, random_seed\n            )\n        ):\n            torch.save(cob_matrix, f\"{self.tmp_dir}/cob_matrix_{i}.pt\")\n        self.tensor_size = tensor_size\n        self.n_cob = n_cob\n        self.cob_prob = cob_prob\n        self.device = device\n\n    @torch.no_grad()\n    def __call__(self, tensor: torch.Tensor, return_basis: bool = False):\n        \"\"\"Apply a change of basis to a tensor.\n\n        Args:\n            tensor (torch.Tensor): Tensor to apply the change of basis to.\n            return_basis (bool, optional): Whether to return the change of\n            basis matrix as well.\n        \"\"\"\n        cob_prob = torch.rand(1).item()\n        if cob_prob > self.cob_prob:\n            return tensor\n        random_cob = torch.randint(low=0, high=self.n_cob, size=(1,))\n        cob_matrix = torch.load(\n            f\"{self.tmp_dir}/cob_matrix_{int(random_cob)}.pt\"\n        ).to(self.device)\n\n        # apply change of basis to each tensor dimension\n        inner_tensor = tensor[0, 0]\n        tensor_size = inner_tensor.shape[-1]\n        original_shape = inner_tensor.shape\n        cob_matrix = cob_matrix.transpose(0, 1)\n        inner_tensor = torch.matmul(\n            inner_tensor.reshape(-1, tensor_size), cob_matrix\n        ).reshape(original_shape)\n        inner_tensor = inner_tensor.permute(0, 2, 1)\n        inner_tensor = torch.matmul(\n            
inner_tensor.reshape(-1, tensor_size), cob_matrix\n        ).reshape(original_shape)\n        inner_tensor = inner_tensor.permute(2, 1, 0)\n        inner_tensor = torch.matmul(\n            inner_tensor.reshape(-1, tensor_size), cob_matrix\n        ).reshape(original_shape)\n        inner_tensor = inner_tensor.permute(2, 0, 1)\n        tensor[0, 0] = inner_tensor\n        if return_basis:\n            return tensor, cob_matrix.transpose(0, 1)\n        return tensor\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/data/dataset.py",
    "content": "import json\nimport os\nimport shutil\nimport tempfile\nfrom pathlib import Path\nfrom typing import List, Tuple\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import Dataset\n\nfrom open_alpha_tensor.core.data.generation import generate_synthetic_data\nfrom open_alpha_tensor.core.data.utils import (\n    get_scalars,\n    map_triplet_to_action,\n)\n\nSAVE_DIR_SYNT = str(Path.home() / \".data_alpha_tensor/synthetic_data\")\n\n\ndef compute_move(triplets: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]):\n    \"\"\"Computes the outer product of the three tensors in the triplet that\n    will be subtracted from the current state.\n\n    Args:\n        triplets (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Tensors u,\n        v, and w.\n    \"\"\"\n    u, v, w = triplets\n    return u.reshape(-1, 1, 1) * v.reshape(1, -1, 1) * w.reshape(1, 1, -1)\n\n\nclass SyntheticDataBuffer(Dataset):\n    \"\"\"Dataset of synthetically generated demonstrations.\"\"\"\n\n    def __init__(\n        self,\n        tensor_size,\n        n_data,\n        limit_rank,\n        prob_distr,\n        n_prev_actions: int,\n        device: str,\n        n_steps: int,\n        random_seed=None,\n    ):\n        \"\"\"Builds a dataset of synthetic demonstrations.\n\n        Args:\n            tensor_size (int): Size of the tensor.\n            n_data (int): Number of demonstrations to generate.\n            limit_rank (int): Maximum rank of the generated tensors.\n            prob_distr (Callable): Probability distribution to use to generate\n            the tensors.\n            n_prev_actions (int): Number of previous actions to use as input.\n            device (str): Name of the torch device to use.\n            n_steps (int): Number of steps to perform in the environment.\n            random_seed (int, optional): Random seed to use.\n        \"\"\"\n        self.device = device\n        self.len_data = 0\n        self.n_prev_actions = n_prev_actions\n        
self.limit_rank = limit_rank\n        self.n_steps = n_steps\n        self.save_dir = os.path.join(SAVE_DIR_SYNT, f\"size_{tensor_size}\")\n        Path(self.save_dir).mkdir(parents=True, exist_ok=True)\n        number_of_triplets = len(list(Path(self.save_dir).glob(\"*.pt\"))) // 2\n        if number_of_triplets < n_data:\n            self.len_data = number_of_triplets\n            for i, (output_tensor, list_of_triplets) in enumerate(\n                generate_synthetic_data(\n                    tensor_size,\n                    n_data - number_of_triplets,\n                    limit_rank,\n                    prob_distr,\n                    random_seed,\n                )\n            ):\n                torch.save(\n                    output_tensor,\n                    os.path.join(\n                        self.save_dir, f\"output_tensor_{self.len_data}.pt\"\n                    ),\n                )\n                torch.save(\n                    list_of_triplets,\n                    os.path.join(\n                        self.save_dir, f\"list_of_triplets_{self.len_data}.pt\"\n                    ),\n                )\n                self.len_data += 1\n        else:\n            self.len_data = n_data\n\n    def __len__(self):\n        return self.len_data * self.limit_rank\n\n    @torch.no_grad()\n    def __getitem__(self, idx):\n        i = idx // self.limit_rank\n        j = idx % self.limit_rank\n        output_tensor = torch.load(\n            os.path.join(self.save_dir, f\"output_tensor_{i}.pt\")\n        )\n        list_of_triplets = torch.load(\n            os.path.join(self.save_dir, f\"list_of_triplets_{i}.pt\")\n        )\n        if j != self.limit_rank - 1:\n            moves = list_of_triplets[j + 1 :]  # noqa E203\n            output_tensor = self._apply_moves(output_tensor, moves)\n        triplet = list_of_triplets[j]\n        output_tensor = torch.stack(\n            [\n                output_tensor,\n                *(\n           
         compute_move(t)\n                    for t in reversed(\n                        list_of_triplets[\n                            j + 1 : j + 1 + self.n_prev_actions  # noqa E203\n                        ]\n                    )\n                ),\n            ]\n        )\n        if len(output_tensor) < self.n_prev_actions + 1:\n            output_tensor = torch.cat(\n                [\n                    output_tensor,\n                    torch.zeros(\n                        self.n_prev_actions + 1 - len(output_tensor),\n                        *output_tensor.shape[1:],\n                    ),\n                ]\n            )\n        policy = map_triplet_to_action(triplet, base=5, n_steps=self.n_steps)\n        reward = torch.tensor([-(j + 1)])\n        scalar = get_scalars(output_tensor, self.limit_rank - j, with_bs=False)\n        return (\n            output_tensor.to(self.device),\n            scalar.to(self.device),\n            policy.to(self.device),\n            reward.to(self.device),\n        )\n\n    @staticmethod\n    def _apply_moves(\n        tensor: torch.Tensor,\n        moves: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]],\n    ):\n        \"\"\"Given an initial state and a list of moves, applies the moves to\n        the state.\n\n        Args:\n            tensor (torch.Tensor): Initial state.\n            moves (List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]):\n            List of moves.\n        \"\"\"\n        for u, v, w in moves:\n            tensor = tensor - u.reshape(-1, 1, 1) * v.reshape(\n                1, -1, 1\n            ) * w.reshape(1, 1, -1)\n        return tensor\n\n\nclass GameDataBuffer(Dataset):\n    \"\"\"Buffer to store the data from the games played by the MCTS agent.\"\"\"\n\n    def __init__(self, device: str, max_buffer_size: int):\n        \"\"\"Initializes the buffer.\n\n        Args:\n            device (str): Name of the torch device to use.\n            max_buffer_size (int): Maximum 
size of the buffer.\n        \"\"\"\n        self.num_games = 0\n        self.temp_dir = tempfile.mkdtemp(\"game_data_buffer\")\n        self.game_data = {}\n        self.max_buffer_size = max_buffer_size\n        self.device = device\n\n    def __del__(self):\n        shutil.rmtree(self.temp_dir)\n\n    def add_game(\n        self,\n        states: List[torch.Tensor],\n        policies: List[torch.Tensor],\n        rewards: List[torch.Tensor],\n    ):\n        \"\"\"Adds a played game to the buffer.\n\n        Args:\n            states (List[torch.Tensor]): Observed game states.\n            policies (List[torch.Tensor]): List of policies.\n            rewards (List[torch.Tensor]): Observed rewards.\n        \"\"\"\n        self.game_data[self.num_games] = len(states)\n        torch.save(\n            states, os.path.join(self.temp_dir, f\"states_{self.num_games}.pt\")\n        )\n        torch.save(\n            policies,\n            os.path.join(self.temp_dir, f\"policies_{self.num_games}.pt\"),\n        )\n        torch.save(\n            rewards,\n            os.path.join(self.temp_dir, f\"rewards_{self.num_games}.pt\"),\n        )\n        self.num_games += 1\n        if self.num_games >= self.max_buffer_size:\n            # remove oldest game. Note that this line is not thread safe. 
Lock\n            # should be added if multiple threads are used.\n            self.num_games = 0\n\n    def __len__(self):\n        return sum(self.game_data.values())\n\n    @torch.no_grad()\n    def __getitem__(self, idx):\n        i = 0\n        while idx >= self.game_data[i]:\n            idx -= self.game_data[i]\n            i += 1\n        states = torch.load(os.path.join(self.temp_dir, f\"states_{i}.pt\"))\n        policies = torch.load(os.path.join(self.temp_dir, f\"policies_{i}.pt\"))\n        rewards = torch.load(os.path.join(self.temp_dir, f\"rewards_{i}.pt\"))\n        return (\n            states[idx].to(self.device),\n            get_scalars(states[idx], idx, with_bs=False).to(self.device),\n            policies[idx].to(self.device).argmax(dim=-1),\n            rewards[idx].to(self.device).reshape(1),\n        )\n\n    def save_game_data(self, path: str):\n        \"\"\"Copy save_dir content in path and save game_data\n        in json format\n        \"\"\"\n        shutil.copytree(self.temp_dir, path, dirs_exist_ok=True)\n        with open(os.path.join(path, \"game_data.json\"), \"w\") as f:\n            json.dump(self.game_data, f)\n\n    def load_game_data(self, path: str):\n        \"\"\"Load game_data from json format and copy content\n        in save_dir\n        \"\"\"\n        with open(os.path.join(path, \"game_data.json\"), \"r\") as f:\n            self.game_data = json.load(f)\n        shutil.copytree(path, self.temp_dir)\n        self.num_games = len(self.game_data)\n\n\nclass TensorGameDataset(Dataset):\n    \"\"\"Dataset to be used for training the AlphaTensor algorithm using both\n    actor generated and synthetic data. A basis change can be applied to both\n    the data type with a probability specified in the constructor. 
The\n    synthetic data and the actor generated one are stored in two data buffers.\n    \"\"\"\n\n    def __init__(\n        self,\n        len_data,\n        pct_synth,\n        tensor_size,\n        n_synth_data,\n        limit_rank,\n        prob_distr,\n        action_memory_len: int,\n        device: str,\n        n_steps: int,\n        random_seed=None,\n    ):\n        self.synthetic_data_buffer = SyntheticDataBuffer(\n            tensor_size,\n            n_synth_data,\n            limit_rank,\n            prob_distr,\n            action_memory_len,\n            n_steps=n_steps,\n            device=device,\n            random_seed=random_seed,\n        )\n        self.game_data_buffer = GameDataBuffer(\n            device=device, max_buffer_size=100000\n        )\n        self.best_game_data_buffer = GameDataBuffer(\n            device=device, max_buffer_size=1000\n        )\n        self.len_data = len_data\n        self.pct_synth = pct_synth\n        self.pct_best_game = 0\n        self.synth_bool = torch.ones(len_data, dtype=torch.bool)\n        self.synth_idx = torch.from_numpy(\n            np.random.choice(\n                len(self.synthetic_data_buffer), len_data, replace=False\n            )\n        )\n        self.game_idx = None\n        self.best_game_idx = None\n        self.action_memory_len = action_memory_len\n        self.tensor_size = tensor_size\n        self.device = device\n\n    def change_training_split(self, pct_synth, pct_best_game):\n        self.pct_synth = pct_synth\n        self.pct_best_game = pct_best_game\n\n    def recompute_synthetic_indexes(self):\n        if len(self.game_data_buffer) > 0:\n            self.synth_bool = torch.rand(self.len_data) < self.pct_synth\n            len_synth_data = self.synth_bool.sum().item()\n            self.synth_idx = torch.from_numpy(\n                np.random.choice(\n                    len(self.synthetic_data_buffer),\n                    len_synth_data,\n                    
replace=False,\n                )\n            )\n            if len(self.best_game_data_buffer) > 0 and self.pct_best_game > 0:\n                len_game_data = int(\n                    (1 - self.pct_synth - self.pct_best_game) * self.len_data\n                )\n                replace_game = len_game_data > len(self.game_data_buffer)\n                len_best_game_data = (\n                    self.len_data - len_synth_data - len_game_data\n                )\n                replace_best_game = len_best_game_data > len(\n                    self.best_game_data_buffer\n                )\n                self.game_idx = torch.from_numpy(\n                    np.random.choice(\n                        len(self.game_data_buffer),\n                        len_game_data,\n                        replace=replace_game,\n                    )\n                )\n                self.best_game_idx = torch.from_numpy(\n                    np.random.choice(\n                        len(self.best_game_data_buffer),\n                        len_best_game_data,\n                        replace=replace_best_game,\n                    )\n                )\n            else:\n                len_game_data = self.len_data - len_synth_data\n                replace_game = len_game_data > len(self.game_data_buffer)\n                self.game_idx = torch.from_numpy(\n                    np.random.choice(\n                        len(self.game_data_buffer),\n                        len_game_data,\n                        replace=replace_game,\n                    )\n                )\n\n    def __getitem__(self, idx):\n        if self.synth_bool[idx]:\n            return self.synthetic_data_buffer[\n                self.synth_idx[self.synth_bool[:idx].sum()]\n            ]\n        else:\n            if self.pct_best_game > 0 and self.best_game_idx is not None:\n                if idx - self.synth_bool[:idx].sum() < len(self.best_game_idx):\n                    return 
self.best_game_data_buffer[\n                        self.best_game_idx[idx - self.synth_bool[:idx].sum()]\n                    ]\n                else:\n                    return self.game_data_buffer[\n                        self.game_idx[\n                            idx\n                            - self.synth_bool[:idx].sum()\n                            - len(self.best_game_idx)\n                        ]\n                    ]\n            else:\n                return self.game_data_buffer[\n                    self.game_idx[idx - self.synth_bool[:idx].sum()]\n                ]\n\n    def __len__(self):\n        return self.len_data\n\n    def add_game(\n        self,\n        states: List[torch.Tensor],\n        policies: List[torch.Tensor],\n        rewards: List[torch.Tensor],\n    ):\n        self.game_data_buffer.add_game(states, policies, rewards)\n\n    def add_best_game(\n        self,\n        states: List[torch.Tensor],\n        policies: List[torch.Tensor],\n        rewards: List[torch.Tensor],\n    ):\n        self.best_game_data_buffer.add_game(states, policies, rewards)\n\n    def save_game_data(self, path):\n        self.game_data_buffer.save_game_data(os.path.join(path, \"game_data\"))\n        self.best_game_data_buffer.save_game_data(\n            os.path.join(path, \"best_game_data\")\n        )\n\n    def load_game_data(self, path):\n        self.game_data_buffer.load_game_data(os.path.join(path, \"game_data\"))\n        self.best_game_data_buffer.load_game_data(\n            os.path.join(path, \"best_game_data\")\n        )\n\n    @property\n    def input_tensor(self) -> torch.Tensor:\n        max_matrix_size = int(np.sqrt(self.tensor_size))\n        input_tensor = torch.zeros(\n            1,\n            self.action_memory_len + 1,\n            self.tensor_size,\n            self.tensor_size,\n            self.tensor_size,\n        )\n        matrix_dims = (\n            torch.randint(1, max_matrix_size, (3,))\n            
.detach()\n            .cpu()\n            .numpy()\n            .tolist()\n        )\n        operation_tensor = self._build_tensor_game_input(\n            *matrix_dims, action_memory_len=self.action_memory_len\n        )\n\n        input_tensor[\n            0,\n            :,\n            : operation_tensor.shape[1],\n            : operation_tensor.shape[2],\n            : operation_tensor.shape[3],\n        ] = operation_tensor\n        return input_tensor.to(self.device)\n\n    @staticmethod\n    def _build_tensor_game_input(\n        dim_1: int, dim_k: int, dim_2: int, action_memory_len: int\n    ):\n        \"\"\"Build the input tensor for the game. The input tensor has shape\n        (action_memory_len+1, matrix_size**2, matrix_size**2, matrix_size**2).\n        The first slice represent the matrix multiplication tensor which will\n        be reduced by the TensorGame algorithm. The other slices represent the\n        action memory.\n        \"\"\"\n        input_tensor = torch.zeros(\n            action_memory_len + 1, dim_1 * dim_k, dim_k * dim_2, dim_1 * dim_2\n        )\n        for r in range(dim_1 * dim_2):\n            for k in range(dim_k):\n                input_tensor[\n                    0, (r // dim_2) * dim_k + k, k * dim_2 + r % dim_2, r\n                ] = 1\n        return input_tensor\n\n    def games_are_good(self):\n        return False\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/data/generation.py",
    "content": "from typing import Callable\n\nimport torch\n\n\ndef generate_synthetic_data(\n    tensor_size: int,\n    n_data: int,\n    limit_rank: int,\n    prob_distr: Callable = torch.randn,\n    random_seed: int = None,\n):\n    \"\"\"Generates synthetic demonstrations.\n\n    Args:\n        tensor_size (int): Size of the tensor.\n        n_data (int): Number of demonstrations.\n        limit_rank (int): Limit rank of each tensor.\n        prob_distr (Callable, optional): Distribution of the entries of the\n        tensor.\n        random_seed (int, optional): Random seed for reproducibility.\n    \"\"\"\n    if random_seed is not None:\n        torch.random.manual_seed(random_seed)\n    for _ in range(n_data):\n        # rank = torch.randint(low=1, high=limit_rank + 1, size=(1,)).item()\n        rank = limit_rank\n        output_tensor = torch.zeros(tensor_size, tensor_size, tensor_size)\n        list_of_triplets = []\n        for i in range(rank):\n            valid_triplet = False\n            while not valid_triplet:\n                u = prob_distr(tensor_size)\n                v = prob_distr(tensor_size)\n                w = prob_distr(tensor_size)\n                generated_tensor = (\n                    u.reshape(-1, 1, 1)\n                    * v.reshape(1, -1, 1)\n                    * w.reshape(1, 1, -1)\n                )\n                if not (generated_tensor == 0).all():\n                    valid_triplet = True\n                    list_of_triplets.append((u, v, w))\n                    output_tensor += generated_tensor\n        yield output_tensor, list_of_triplets\n\n\ndef f_prob_distribution(size):\n    \"\"\"Samples a tensor of values from a distribution with a peak at 0 and a\n    tail at -2 and 2.\n\n    Args:\n        size (int): Number of values to sample.\n    \"\"\"\n    f_vals = torch.tensor([-2, -1, 0, 1, 2])\n    f_probs = torch.tensor([0.001, 0.099, 0.8, 0.099, 0.001]).unsqueeze(0)\n    f_cum_sum = torch.cumsum(f_probs, 
dim=-1)\n    unif_prob = torch.rand((size, 1))\n    tensor_idx = torch.argmax((unif_prob <= f_cum_sum).int(), dim=1)\n    tensor = f_vals[tensor_idx]\n    return tensor\n\n\ndef z2_prob_distribution(size):\n    \"\"\"Samples a binary tensor with uniform probability of 0 and 1.\n\n    Args:\n        size (int): Number of values to sample.\n    \"\"\"\n    return (torch.rand(size) > 0.5).int()\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/data/utils.py",
    "content": "from typing import Tuple\n\nimport torch\n\n\ndef get_scalars(input_tensor: torch.Tensor, t_step: int, with_bs: bool = True):\n    \"\"\"Adds the time step to the current state tensor.\n\n    Args:\n        input_tensor (torch.Tensor): Current state tensor.\n        t_step (int): Current time step.\n        with_bs (bool, optional): Whether the batch size is present in the\n        input tensor.\n    \"\"\"\n    # scalars containing the iteration time\n    if with_bs:\n        bs = input_tensor.shape[0]\n        scalars = torch.zeros((bs, 1))\n        scalars[:, 0] = t_step\n    else:\n        scalars = torch.tensor(t_step).unsqueeze(-1).float()\n    return scalars\n\n\ndef map_triplet_to_action(\n    triplet: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],\n    base: int,\n    n_steps: int,\n    add_bias: bool = True,\n):\n    \"\"\"Maps a triplet of tensors to an action.\n\n    Args:\n        triplet (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Triplet of\n        tensors u, v, and w.\n        base (int): Base used for the conversion.\n        n_steps (int): Number of steps in the action.\n        add_bias (bool, optional): Whether to add a bias to the action.\n    \"\"\"\n    # map the triplet to an action. First, we concatenate the three tensors and\n    # then we convert it to an action using the given base representation. 
Each\n    # element is converted using the formula:\n    #   action += element * base^(element_index)\n    u, v, w = triplet\n    n_dim = u.ndim\n    action = torch.cat((u, v, w), dim=-1)\n    action = action.reshape(-1, n_steps, action.shape[-1] // n_steps)\n    if n_dim == 1:\n        action = action.squeeze(0)\n    if add_bias:\n        action = action + base // 2\n    action = action * torch.tensor(\n        [base**i for i in range(action.shape[-1])]\n    )\n    action = action.sum(dim=-1)\n    return action\n\n\n# @torch.jit.script\ndef _single_action_to_triplet(\n    action_val: int,\n    basis: int,\n    out_dim: int,\n    bias: int,\n    device: str,\n):\n    \"\"\"Converts an action to the original triplet (u, v, w) that generated it.\n\n    Args:\n        action_val (int): Action to convert.\n        basis (int): Basis used for the conversion.\n        out_dim (int): Output dimension.\n        bias (int): Bias to subtract from the action.\n        device (str): Name of the torch device to use.\n    \"\"\"\n    triplet = torch.zeros(out_dim).to(device)\n    if action_val > 0:\n        idx = int(\n            torch.log(torch.tensor(action_val))\n            // torch.log(torch.tensor(basis))\n        )\n    else:\n        idx = 0\n    while idx >= 0:\n        temp = int(basis**idx)\n        triplet[idx] = action_val // temp - bias\n        action_val = action_val - temp\n        idx -= 1\n    return triplet\n\n\ndef map_action_to_triplet(\n    action_tensor: torch.Tensor,\n    cardinality: int = 5,\n    vector_size: int = 5,\n    add_bias: bool = True,\n):\n    \"\"\"Maps a batch of actions to the batch of triplets that generated them.\n\n    Args:\n        action_tensor (torch.Tensor): Batch of actions.\n        cardinality (int, optional): Cardinality of the action space.\n        vector_size (int, optional): Size of the vector.\n        add_bias (bool, optional): Whether to use bias.\n    \"\"\"\n    # map the action to a triplet. 
The action is converted to a base 5\n    # representation and then the three elements are extracted from it.\n    # The action has shape (bs, n_steps) and it contains the token for\n    # recreating u, v and w. The token is a number between 0 and n_logits.\n    action_shape = action_tensor.shape\n    action_tensor = action_tensor.reshape(-1)\n    if add_bias:\n        bias = cardinality // 2\n    else:\n        bias = 0\n    triplets = torch.stack(\n        [\n            _single_action_to_triplet(\n                action_tensor[idx],\n                cardinality,\n                vector_size,\n                bias,\n                action_tensor.device,\n            )\n            for idx in range(len(action_tensor))\n        ]\n    )\n    final_size = triplets.shape[-1]\n    return triplets.reshape((*action_shape, final_size))\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/modules/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/modules/alpha_tensor.py",
    "content": "import torch\n\nfrom open_alpha_tensor.core.modules.extras import (\n    QuantileLoss,\n    ValueRiskManagement,\n)\nfrom open_alpha_tensor.core.modules.heads import PolicyHead, ValueHead\nfrom open_alpha_tensor.core.modules.torso import TorsoModel\n\n\nclass AlphaTensorModel(torch.nn.Module):\n    def __init__(\n        self,\n        tensor_length: int,\n        input_size: int,\n        scalars_size: int,\n        emb_dim: int,\n        n_steps: int,\n        n_logits: int,\n        n_samples: int,\n    ):\n        # scalar_size = s\n        # input_size = S\n        # tensor_length = T\n        # emb_dim = c\n        super().__init__()\n        self.tensor_length = tensor_length\n        self.input_size = input_size\n        self.emb_dim = emb_dim\n        self.torso = TorsoModel(\n            scalars_size, input_size, tensor_length, emb_dim\n        )\n        emb_size = 3 * input_size * input_size\n        print(\"Build policy head\")\n        self.policy_head = PolicyHead(\n            emb_size, emb_dim, n_steps, n_logits, n_samples\n        )\n        print(\"Build value head\")\n        self.value_head = ValueHead(\n            2048\n        )  # value dependent on num_head and proj_dim\n        self.policy_loss_fn = torch.nn.CrossEntropyLoss(reduction=\"sum\")\n        self.quantile_loss_fn = QuantileLoss()\n        self.risk_value_management = ValueRiskManagement()\n\n    @property\n    def device(self):\n        return next(self.parameters()).device\n\n    def _train_forward(\n        self,\n        x: torch.Tensor,\n        s: torch.Tensor,\n        g_action: torch.Tensor,\n        g_value: torch.Tensor,\n    ):\n        # shapes\n        # x = (N, T, S, S, S)\n        # s = (N, s)\n        # g_action = (N, N_steps)\n        # g_value = (N, )\n        e = self.torso(x, s)\n        o, z1 = self.policy_head(e, g_action)\n        l_policy = self.policy_loss_fn(\n            o.reshape(-1, o.shape[-1]), g_action.reshape(-1)\n        )\n      
  q = self.value_head(z1)\n        l_value = self.quantile_loss_fn(q, g_value.float())\n        return l_policy, l_value\n\n    def _eval_forward(self, x: torch.Tensor, s: torch.Tensor):\n        e = self.torso(x, s)\n        a, p, z1 = self.policy_head(e)\n        q = self.value_head(z1)\n        q = self.risk_value_management(q)\n        return a, p, q\n\n    def forward(\n        self,\n        x: torch.Tensor,\n        s: torch.Tensor,\n        g_action: torch.Tensor = None,\n        g_value: torch.Tensor = None,\n    ):\n        if g_action is None:\n            return self._eval_forward(x, s)\n        else:\n            assert g_value is not None\n            return self._train_forward(x, s, g_action, g_value)\n\n    @property\n    def n_logits(self):\n        return self.policy_head.n_logits\n\n    @property\n    def n_steps(self):\n        return self.policy_head.n_steps\n\n    @property\n    def n_samples(self):\n        return self.policy_head.n_samples\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/modules/attention.py",
    "content": "import torch\nfrom torch.nn import functional as F\n\n\nclass AttentionHead(torch.nn.Module):\n    def __init__(self, x_size: int, y_size: int, proj_dim: int):\n        # x_size = N_x\n        # y_size = N_y\n        super(AttentionHead, self).__init__()\n        self.proj_dim = proj_dim\n        self.proj_dim_isqrt = 1 / torch.sqrt(torch.tensor(proj_dim))\n        self.queries_proj_layer = torch.nn.Linear(x_size, proj_dim)\n        self.keys_proj_layer = torch.nn.Linear(y_size, proj_dim)\n        self.values_proj_layer = torch.nn.Linear(y_size, proj_dim)\n\n    def forward(self, x: torch.Tensor, y: torch.Tensor, mask: bool = False):\n        queries = self.queries_proj_layer(x)\n        keys = self.keys_proj_layer(y)\n        values = self.values_proj_layer(y)\n        attention = F.softmax(\n            torch.matmul(queries, keys.transpose(-2, -1))\n            * self.proj_dim_isqrt,\n            dim=-1,\n        )\n        if mask:\n            attention = torch.triu(attention, diagonal=1)\n        output = torch.matmul(attention, values)\n        return output\n\n\nclass AttentionDenseBlock(torch.nn.Module):\n    def __init__(self, inner_size: int, multiplier: int = 4):\n        super().__init__()\n        self.norm = torch.nn.LayerNorm(inner_size)\n        self.linear = torch.nn.Linear(inner_size, inner_size * multiplier)\n        self.activation = torch.nn.GELU()\n        self.linear_final = torch.nn.Linear(\n            inner_size * multiplier, inner_size\n        )\n\n    def forward(self, x: torch.Tensor):\n        x_temp = self.activation(self.linear(self.norm(x)))\n        return x + self.linear_final(x_temp)\n\n\nclass AlphaMultiHeadAttention(torch.nn.Module):\n    def __init__(\n        self,\n        x_dim: int,\n        y_dim: int,\n        proj_dim: int = 32,\n        n_heads: int = 16,\n        multiplier: int = 4,\n    ):\n        # x_dim = size of the last dimension of x\n        # y_dim = size of the last dimension of y\n        
super().__init__()\n        self.norm_layer_x = torch.nn.LayerNorm(x_dim)\n        self.norm_layer_y = torch.nn.LayerNorm(y_dim)\n        self.module_list = torch.nn.ModuleList(\n            [AttentionHead(x_dim, y_dim, proj_dim) for _ in range(n_heads)]\n        )\n        self.linear = torch.nn.Linear(n_heads * proj_dim, x_dim)\n\n        self.dense = AttentionDenseBlock(x_dim, multiplier)\n\n    def forward(\n        self, x: torch.nn.Module, y: torch.nn.Module, mask: bool = False\n    ):\n        # x.size = (Nx, c1), y.size = (Ny, c2)\n        x_norm = self.norm_layer_x(x)\n        y_norm = self.norm_layer_y(y)\n        temp = torch.cat(\n            [layer(x_norm, y_norm, mask) for layer in self.module_list], dim=-1\n        )\n        x = x + self.linear(temp)\n        return self.dense(x)\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/modules/extras.py",
    "content": "import torch\n\n\nclass QuantileLoss(torch.nn.Module):\n    def __init__(self, delta: float = 1.0):\n        super().__init__()\n        self.huber_loss = torch.nn.HuberLoss(reduction=\"none\", delta=delta)\n\n    def forward(self, q: torch.Tensor, g: torch.Tensor):\n        n = q.shape[-1]\n        tau = torch.arange(0, n).unsqueeze(0).to(q.device) / n\n        h = self.huber_loss(g, q)\n        k = torch.abs(tau - (g - q > 0).float())\n        return torch.mean(h * k)\n\n\nclass ValueRiskManagement(torch.nn.Module):\n    def __init__(self, u_q: float = 0.75):\n        super(ValueRiskManagement, self).__init__()\n        self.u_q = u_q\n\n    def forward(self, q: torch.Tensor):\n        # q shape = (N, n)\n        j = int(self.u_q * q.shape[-1])\n        return torch.mean(q[:, j:], dim=-1)\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/modules/heads.py",
    "content": "import math\n\nimport torch\nimport torch.nn.functional as F\n\nfrom open_alpha_tensor.core.modules.attention import AlphaMultiHeadAttention\n\n\nclass PositionEncoding(torch.nn.Module):\n    def __init__(self, d_model: int, max_len: int = 5000):\n        super().__init__()\n\n        position = torch.arange(max_len).unsqueeze(1)\n        div_term = torch.exp(\n            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)\n        )\n        pe = torch.zeros(max_len, 1, d_model)\n        pe[:, 0, 0::2] = torch.sin(position * div_term)\n        pe[:, 0, 1::2] = torch.cos(position * div_term)\n        self.register_buffer(\"pe\", pe)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            x: Tensor, shape [seq_len, batch_size, embedding_dim]\n        \"\"\"\n        x = x + self.pe[: x.size(0)]\n        return x\n\n\nclass PolicyHeadDoubleAttention(torch.nn.Module):\n    def __init__(\n        self,\n        n_steps: int,\n        n_heads: int,\n        n_feat: int,\n        emb_size: int,\n        emb_dim: int,\n    ):\n        super().__init__()\n        d_model = n_feat * n_heads\n        self.layer_norm1 = torch.nn.LayerNorm(d_model)\n        self.attention1 = AlphaMultiHeadAttention(d_model, d_model)\n        self.drop1 = torch.nn.Dropout()\n        self.layer_norm2 = torch.nn.LayerNorm(d_model)\n        self.attention2 = AlphaMultiHeadAttention(d_model, emb_dim)\n        self.drop2 = torch.nn.Dropout()\n\n    def forward(self, x: torch.Tensor, e: torch.Tensor):\n        x = self.layer_norm1(x)\n        c = self.attention1(x, x, mask=True)\n        c = self.drop1(c)\n        x = x + c\n        x = self.layer_norm2(x)\n        c = self.attention2(x, e, mask=False)\n        c = self.drop2(c)\n        x = x + c\n        return x\n\n\nclass PolicyHeadCore(torch.nn.Module):\n    def __init__(\n        self,\n        emb_size: int,\n        emb_dim: int,\n        n_steps: int,\n        n_logits: 
int,\n        n_feat: int = 64,\n        n_heads: int = 32,\n        n_layers: int = 2,\n    ):\n        super().__init__()\n        self.embedding = torch.nn.Embedding(n_logits, n_feat * n_heads)\n        self.position_encoding = PositionEncoding(n_feat * n_heads)\n        self.decoders = torch.nn.ModuleList(\n            [\n                PolicyHeadDoubleAttention(\n                    n_steps, n_heads, n_feat, emb_size, emb_dim\n                )\n                for _ in range(n_layers)\n            ]\n        )\n        self.relu = torch.nn.ReLU()\n        self.linear2 = torch.nn.Linear(n_feat * n_heads, n_logits)\n\n    def forward(self, a: torch.Tensor, e: torch.Tensor):\n        x = self.position_encoding(self.embedding(a))\n        for layer in self.decoders:\n            x = layer(x, e)\n        o = self.linear2(self.relu(x))\n        return o, x\n\n\ndef sample_from_logits(a):\n    # returns a sampled element and the associated probability\n    # since cross entropy is run during training we expect logits\n    # not to be probabilities yet.\n    probs = torch.cumsum(F.softmax(a, dim=-1), dim=-1)\n    random_vals = torch.rand(probs.shape[0]).unsqueeze(-1).to(a.device)\n    n_classes = a.shape[-1]\n    new_a_idx = torch.argmax(1.0 * (probs > random_vals), dim=-1)\n    index_bias = torch.arange(0, len(new_a_idx)).to(a.device) * n_classes\n    probs = torch.take(probs, new_a_idx + index_bias)\n    # new_a = F.one_hot(new_a_idx, n_classes)\n    return new_a_idx, probs\n\n\nclass PolicyHead(torch.nn.Module):\n    def __init__(\n        self,\n        emb_size: int,\n        emb_dim: int,\n        n_steps: int,\n        n_logits: int,\n        n_samples: int,\n    ):\n        super().__init__()\n        self.n_logits = n_logits\n        self.n_samples = n_samples\n        self.n_steps = n_steps\n        self.core = PolicyHeadCore(emb_size, emb_dim, n_steps, n_logits)\n\n    def _train_forward(self, e: torch.Tensor, g: torch.Tensor):\n        # e is the embedding, 
shape = (N, m, c)\n        # g represents the previous actions, when training it represents the\n        # list of correct actions, thus we need to shift them (since we do not\n        # want to consider also the latest, correct action when predicting).\n        # g has shape (N, N_steps) and it is a one-hot encoding of N_logits\n        g = torch.roll(g, shifts=-1, dims=1)\n        # the first row will have attention zero during training\n        # g = F.one_hot(g, self.n_logits).float()\n        o, z = self.core(g, e)\n        return o, z[:, 0]\n\n    def _eval_forward(self, e: torch.Tensor):\n        bs = e.shape[0]\n        future_g = (\n            torch.zeros((bs, self.n_samples, self.n_steps)).long().to(e.device)\n        )\n        ps = torch.ones((bs, self.n_samples)).to(e.device)\n        e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1)\n\n        future_g = future_g.view(-1, self.n_steps)\n        ps = ps.view(-1)\n        e = e.view(-1, e.shape[-2], e.shape[-1])\n        for i in range(self.n_steps):\n            o_s, z_s = self.core(future_g[:, : i + 1], e)\n            future_g[:, i], p_i = sample_from_logits(o_s[:, i])\n            ps *= p_i\n        future_g = future_g.view(bs, self.n_samples, self.n_steps)\n        ps = ps.view(bs, self.n_samples)\n        return (\n            future_g,\n            ps,\n            z_s[:, 0].view(bs, self.n_samples, *z_s.shape[2:]).mean(1),\n        )\n\n    def forward(self, e: torch.Tensor, g: torch.Tensor = None):\n        if g is None:\n            return self._eval_forward(e)\n        return self._train_forward(e, g)\n\n\nclass ValueHeadCore(torch.nn.Module):\n    def __init__(self, input_size: int, output_size: int):\n        super().__init__()\n        self.linear = torch.nn.Linear(input_size, output_size)\n        self.relu = torch.nn.ReLU()\n\n    def forward(self, x: torch.Tensor):\n        return self.relu(self.linear(x))\n\n\nclass ValueHead(torch.nn.Module):\n    def __init__(\n        self, 
input_size: int, hidden_size: int = 512, output_size: int = 8\n    ):\n        super().__init__()\n        self.layers = torch.nn.Sequential(\n            *(\n                [ValueHeadCore(input_size, hidden_size)]\n                + [ValueHeadCore(hidden_size, hidden_size)] * 2\n            )\n        )\n        self.linear = torch.nn.Linear(hidden_size, output_size)\n\n    def forward(self, x: torch.Tensor):\n        return self.linear(self.layers(x))\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/modules/torso.py",
    "content": "import torch\n\nfrom open_alpha_tensor.core.modules.attention import AlphaMultiHeadAttention\n\n\nclass TorsoAttentiveModes(torch.nn.Module):\n    def __init__(self, input_dim: int):\n        # input_dim = c\n        super().__init__()\n        self.attention = AlphaMultiHeadAttention(\n            input_dim,\n            input_dim,\n        )\n\n    def forward(self, x1, x2, x3):\n        # x1.size = x2.size = x3.size = (N, S, S, c)\n        # where N is the batch size\n        size = x1.shape[-2]\n        input_list = [x1, x2, x3]\n        for m1, m2 in [(0, 1), (2, 0), (1, 2)]:\n            matrix = torch.cat([input_list[m1], input_list[m2]], dim=-2)\n            # matrix_size = (N, S, 2S, c)\n            out = self.attention(matrix, matrix)\n            input_list[m1] = out[:, :, :size]\n            input_list[m2] = out[:, :, size:]\n        return input_list\n\n\nclass TorsoModel(torch.nn.Module):\n    \"\"\"Torso model of OpenAlphaTensor.\n\n    It maps an input tensor of shape (N, T, S, S, S) to (N, 3S*S, c), where:\n\n        N is the batch size;\n        T is the context size (size of the history + 1);\n        S is the number of elements in each matrix to be multiplied;\n        c is the output dimensionality.\n    \"\"\"\n\n    def __init__(\n        self,\n        scalars_size: int,\n        input_size: int,\n        tensor_length: int,\n        out_size: int,\n    ):\n        # scalar_size = s\n        # input_size = S\n        # tensor_length = T\n        # out_size = c\n        super(TorsoModel, self).__init__()\n        self.linears_1 = torch.nn.ModuleList(\n            [\n                torch.nn.Linear(scalars_size, input_size * input_size)\n                for _ in range(3)\n            ]\n        )\n        self.linears_2 = torch.nn.ModuleList(\n            [\n                torch.nn.Linear(input_size * tensor_length + 1, out_size)\n                for _ in range(3)\n            ]\n        )\n        self.attentive_modes = 
torch.nn.ModuleList(\n            [TorsoAttentiveModes(out_size) for _ in range(8)]\n        )\n\n    def forward(self, x: torch.Tensor, scalars: torch.Tensor):\n        # x.size = (N, T, S, S, S)\n        # scalars.size = (N, s)\n        batch_size = x.shape[0]\n        S = x.shape[-1]\n        T = x.shape[1]\n        x1 = x.permute(0, 2, 3, 4, 1).reshape(batch_size, S, S, S * T)\n        x2 = x.permute(0, 4, 2, 3, 1).reshape(batch_size, S, S, S * T)\n        x3 = x.permute(0, 3, 4, 2, 1).reshape(batch_size, S, S, S * T)\n        input_list = [x1, x2, x3]\n        for i in range(3):\n            temp = self.linears_1[i](scalars).reshape(batch_size, S, S, 1)\n            input_list[i] = torch.cat([input_list[i], temp], dim=-1)\n            input_list[i] = self.linears_2[i](input_list[i])\n        x1, x2, x3 = input_list\n        for layer in self.attentive_modes:\n            x1, x2, x3 = layer(x1, x2, x3)\n        return torch.stack([x1, x2, x3], dim=2).reshape(\n            batch_size, 3 * S * S, -1\n        )\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/core/training.py",
    "content": "from pathlib import Path\nfrom typing import Tuple, List\n\nimport torch.optim\nimport tqdm\nfrom torch.utils.data import DataLoader\n\nfrom open_alpha_tensor.config import (\n    BASE_CHECKPOINT_DATA_DIR,\n    BASE_CHECKPOINT_DIR,\n)\nfrom open_alpha_tensor.core.actors.stage import actor_prediction\nfrom open_alpha_tensor.core.data.basis_change import ChangeOfBasis\nfrom open_alpha_tensor.core.data.dataset import TensorGameDataset\nfrom open_alpha_tensor.core.data.generation import f_prob_distribution\nfrom open_alpha_tensor.core.data.utils import map_action_to_triplet\nfrom open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel\n\n\n@torch.no_grad()\ndef _single_act(\n    actor_id: int,\n    model: torch.nn.Module,\n    input_tensor: torch.Tensor,\n    device: str,\n    mc_n_sim: int,\n    N_bar: int,\n    cob: ChangeOfBasis,\n    max_rank: int,\n):\n    \"\"\"Executes an episode for a single actor using the MCTS.\n    The method is called multiple times in parallel with different actor ids.\n\n    Args:\n        actor_id (int): The id of the actor.\n        model (torch.nn.Module): The model used to take the action.\n        input_tensor (torch.Tensor): State of the game.\n        device (str): The name of the torch device used for training.\n        mc_n_sim (int): Number of simulations during Monte Carlo tree search.\n        N_bar (int): N_bar parameter used to compute tau when improving the\n        policy.\n        cob (ChangeOfBasis): The change of basis used to generate the input\n        tensor.\n        max_rank (int): The maximum matrix rank achieved by the actor before\n        tree search is stopped.\n    \"\"\"\n    print(f\"Acting with actor {actor_id}\")\n    model.to(device)\n    cob.device = device\n    input_tensor = input_tensor.to(device)\n    input_tensor_cob = cob(input_tensor)\n    states, policies, rewards = actor_prediction(\n        model, input_tensor_cob, max_rank, mc_n_sim, N_bar\n    )\n    
print(f\"Actor {actor_id} finished\")\n    states = [s.to(\"cpu\") for s in states]\n    policies = policies.to(\"cpu\")\n    rewards = rewards.to(\"cpu\")\n    return actor_id, states, policies, rewards\n\n\ndef swap_data(\n    states: List[torch.Tensor],\n    actions: List[torch.Tensor],\n):\n    \"\"\"Swaps the last action with a random one and updates the states\n    accordingly for a single game.\n\n    Args:\n        states (List[torch.Tensor]): All the states for a single game.\n        actions (List[torch.Tensor]): All the actions through the game.\n    \"\"\"\n    last_action = actions[-1]\n    swap_index = torch.randint(0, len(states) - 1, (1,)).item()\n    actions[-1] = actions[swap_index]\n    actions[swap_index] = last_action\n\n    actual_state = states[swap_index]\n    for i in range(swap_index + 1, len(states) + 1):\n        prev_action = actions[i - 1]\n        triplet = map_action_to_triplet(\n            prev_action, vector_size=actual_state.shape[-1]\n        )\n        vector_size = actual_state.shape[-1] // 3\n        bs = actual_state.shape[0]\n        u = triplet[:, :vector_size].reshape(bs, -1, 1, 1)\n        v = triplet[:, vector_size : 2 * vector_size].reshape(  # noqa E203\n            bs, 1, -1, 1\n        )\n        w = triplet[:, 2 * vector_size :].reshape(bs, 1, 1, -1)  # noqa E203\n        reduced_state = u * v * w\n        fut_state = actual_state[:, 0] - reduced_state\n        new_state = actual_state[:, 1:].roll(1, dims=1)\n        new_state[:, 0] = reduced_state\n        actual_state = torch.cat([fut_state, new_state], dim=1)\n        states[i] = actual_state\n    return states, actions\n\n\nclass Trainer:\n    \"\"\"Trainer for the AlphaTensor model. The trainer does not require an\n    explicit loss since the loss is computed by the model itself. 
The trainer\n    is responsible for both the training step and the acting one, storing\n    acting performance in a buffer.\n    \"\"\"\n\n    def __init__(\n        self,\n        model: AlphaTensorModel,\n        tensor_size: int,\n        n_steps: int,\n        batch_size: int,\n        optimizer: torch.optim.Optimizer,\n        device: str,\n        len_data: int,\n        pct_synth: float,\n        n_synth_data: int,\n        limit_rank: int,\n        n_cob: int,\n        cob_prob: float,\n        data_augmentation: bool,\n        loss_params: Tuple[float, float] = None,\n        random_seed: int = None,\n        checkpoint_dir: str = None,\n        checkpoint_data_dir: Path = None,\n        extra_devices: List[str] = None,\n    ):\n        \"\"\"Initializes the trainer.\n\n        Args:\n            model (AlphaTensorModel): The model to train.\n            tensor_size (int): Flattened size of the matrices to be multiplied.\n            n_steps (int): Number of steps used to get a single action out of\n            a triplet.\n            batch_size (int): Batch size.\n            optimizer (torch.optim.Optimizer): The optimizer used to train the\n            model.\n            device (str): The name of the torch device used for training.\n            len_data (int): Number of training samples used (both actor\n            generated and synthetic).\n            pct_synth (float): Initial percentage of synthetic samples used\n            for training.\n            n_synth_data (int): Number of synthetic training samples.\n            limit_rank (int): Maximum rank for synthetically-generated\n            matrices.\n            n_cob (int): Number of change of basis (cob) used for a single\n            training sample.\n            cob_prob (float): Probability of applying a change of basis.\n            data_augmentation (bool): Whether to randomly swap the last\n            operation of an episode with another operation.\n            loss_params (Tuple[float, 
float]): Alpha and Beta parameters used\n            in the loss function.\n            random_seed (int): Randomizing seed.\n            checkpoint_dir (str): Directory used to store model checkpoints.\n            checkpoint_data_dir (Path): Directory used to store games as JSON\n            files.\n            extra_devices (List[str]): Extra devices names used for multi-GPU\n            training.\n        \"\"\"\n        self.model = model\n        self.optimizer = optimizer\n        self.device = device\n        self.dataset = TensorGameDataset(\n            len_data,\n            pct_synth,\n            tensor_size,\n            n_synth_data,\n            limit_rank,\n            f_prob_distribution,\n            device=device,\n            n_steps=n_steps,\n            action_memory_len=(model.tensor_length - 1),\n            random_seed=random_seed,\n        )\n        self.batch_size = batch_size\n        self.max_rank = limit_rank\n        if loss_params is None:\n            self.alpha = 1\n            self.beta = 1\n        else:\n            self.alpha, self.beta = loss_params\n        self.checkpoint_dir = Path(\n            checkpoint_dir if checkpoint_dir else BASE_CHECKPOINT_DIR\n        )\n        self.checkpoint_dir.mkdir(exist_ok=True, parents=True)\n        self.checkpoint_data_dir = (\n            checkpoint_data_dir\n            if checkpoint_data_dir\n            else Path(BASE_CHECKPOINT_DATA_DIR)\n        )\n        self.checkpoint_data_dir.mkdir(exist_ok=True, parents=True)\n        self.change_of_basis = ChangeOfBasis(\n            tensor_size, n_cob, cob_prob, device, random_seed\n        )\n        self.data_augmentation = data_augmentation\n        self.extra_devices = extra_devices\n\n    def train_step(self):\n        \"\"\"Executes a single training step by optimizing the current model\n        parameters.\"\"\"\n        self.dataset.recompute_synthetic_indexes()\n        self.model.train()\n        total_loss = 0\n        dl = 
DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)\n        print(\"Training AlphaTensor\")\n        for states, scalars, policies, rewards in tqdm.tqdm(dl):\n            loss_policy, loss_value = self.model(\n                states, scalars, policies, rewards\n            )\n            loss = self.alpha * loss_policy + self.beta * loss_value\n            self.optimizer.zero_grad()\n            loss.backward()\n            self.optimizer.step()\n            total_loss += loss.item()\n        print(f\"Total loss: {total_loss}\")\n\n    @torch.no_grad()\n    def act_step(\n        self,\n        input_tensor: torch.Tensor,\n        n_games: int,\n        mc_n_sim: int,\n        N_bar: int,\n    ):\n        \"\"\"Runs actors in parallel to generate multiple games starting from\n        the same input tensor.\n\n        Args:\n            input_tensor (torch.Tensor): The input tensor used to generate the\n            games.\n            n_games (int): Number of games to generate / actors to be run in\n            parallel.\n            mc_n_sim (int): Number of simulations used in the Monte Carlo tree\n            search.\n            N_bar (int): N_bar parameter used to compute tau when improving\n            the policy.\n        \"\"\"\n        self.model.eval()\n        best_reward = -1e10\n        best_game = None\n\n        if self.extra_devices:\n            from joblib import Parallel, delayed\n\n            # this means that there is an empty GPU available\n            # thus we can use it to parallelize the acting step\n            # use joblib to parallelize the acting step\n            # we should use _single_act as a function to be parallelized\n            extra_devices = (\n                self.extra_devices * (n_games // len(self.extra_devices))\n                + self.extra_devices[: n_games % len(self.extra_devices)]\n            )\n            self.model.to(\"cpu\")\n            input_tensor = input_tensor.to(\"cpu\")\n\n            
print(f\"Starting acting phase with {n_games} games\")\n            results = Parallel(n_jobs=len(self.extra_devices))(\n                delayed(_single_act)(\n                    actor_id,\n                    self.model,\n                    input_tensor,\n                    extra_devices[actor_id],\n                    mc_n_sim,\n                    N_bar,\n                    self.change_of_basis,\n                    self.max_rank,\n                )\n                for actor_id in range(n_games)\n            )\n            self.model.to(self.device)\n\n            for actor_id, states, policies, rewards in results:\n                if rewards[-1] > best_reward:\n                    print(f\"New best actor! Actor: {actor_id}\")\n                    best_reward = rewards[-1]\n                    best_game = (states, policies, rewards)\n                self.dataset.add_game(states, policies, rewards)\n                if self.data_augmentation:\n                    states, policies = swap_data(states, policies)\n                    self.dataset.add_game(states, policies, rewards)\n            if best_game is not None:\n                self.dataset.add_best_game(*best_game)\n        else:\n            for actor_id in range(n_games):\n                input_tensor_cob = self.change_of_basis(input_tensor).to(\n                    self.device\n                )\n                print(f\"Running actor {actor_id} / {n_games}\")\n                states, policies, rewards = actor_prediction(\n                    self.model,\n                    input_tensor_cob,\n                    self.max_rank,\n                    mc_n_sim,\n                    N_bar,\n                )\n                print(\n                    f\"Actor {actor_id} finished. 
Final reward: {rewards[-1]}\"\n                )\n                if rewards[-1] > best_reward:\n                    print(\"New best actor!\")\n                    best_reward = rewards[-1]\n                    best_game = (states, policies, rewards)\n                self.dataset.add_game(states, policies, rewards)\n                if self.data_augmentation:\n                    states, policies = swap_data(states, policies)\n                    self.dataset.add_game(states, policies, rewards)\n            if best_game is not None:\n                self.dataset.add_best_game(*best_game)\n\n    def train(\n        self,\n        n_epochs: int,\n        n_games: int,\n        mc_n_sim: int,\n        N_bar: int,\n        initial_lr: float,\n        lr_decay_factor: float,\n        lr_decay_steps: int,\n        starting_epoch: int = 0,\n    ):\n        \"\"\"Trains the model for a given number of epochs.\n\n        Args:\n            n_epochs (int): Number of training epochs.\n            n_games (int): Number of games to generate / actors to be run in\n            parallel at each step.\n            mc_n_sim (int): Number of simulations used in the Monte Carlo tree\n            search at each step.\n            N_bar (int): N_bar parameter used to compute tau when improving\n            the policy.\n            initial_lr (float): Initial learning rate.\n            lr_decay_factor (float): Learning rate's decay factor.\n            lr_decay_steps (int): Number of learning rate's decay steps.\n            starting_epoch (int, optional): Epoch from which to start / resume\n            training.\n        \"\"\"\n        self.model = self.model.to(self.device)\n        if starting_epoch + 1 > n_epochs // 50:\n            self.dataset.change_training_split(0.7, 0.05)\n        if (\n            starting_epoch + 1 > n_epochs // 10\n        ):  # when restarting from a checkpoint\n            mc_n_sim = mc_n_sim * 4\n        for epoch in range(starting_epoch, n_epochs):\n   
         if epoch + 1 == n_epochs // 50:\n                self.dataset.change_training_split(0.7, 0.05)\n            if epoch + 1 == n_epochs // 10:\n                mc_n_sim = mc_n_sim * 4\n            # apply learning rate decay each epoch if epoch < lr_decay_steps\n            if 0 < epoch < lr_decay_steps - 1:\n                lr = initial_lr * lr_decay_factor ** (epoch / lr_decay_steps)\n                for param_group in self.optimizer.param_groups:\n                    param_group[\"lr\"] = lr\n\n            print(f\"Epoch {epoch} / {n_epochs}\")\n            self.train_step()\n            if epoch % 10 == 0:\n                self.act_step(\n                    self.dataset.input_tensor, n_games, mc_n_sim, N_bar\n                )\n            # save checkpoint\n            if (epoch + 1) % 100 == 0:\n                checkpoint_name = f\"checkpoint_{epoch + 1}.pt\"\n                checkpoint = {\n                    \"model_state_dict\": self.model.state_dict(),\n                    \"optimizer_state_dict\": self.optimizer.state_dict(),\n                }\n                torch.save(\n                    checkpoint,\n                    self.checkpoint_dir / checkpoint_name,\n                )\n                self.dataset.save_game_data(self.checkpoint_data_dir)\n            # exit strategy\n            if self.dataset.games_are_good():\n                break\n        print(\"Training finished\")\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/operations/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/operations/checkpoint_op.py",
    "content": "from pathlib import Path\nfrom typing import Any\n\nimport torch\nfrom nebullvm.operations.base import Operation\n\nfrom open_alpha_tensor.config import (\n    BASE_CHECKPOINT_DATA_DIR,\n    BASE_CHECKPOINT_DIR,\n)\nfrom open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel\nfrom open_alpha_tensor.core.training import Trainer\n\n\ndef optimizer_to(optim: torch.optim.Optimizer, device: str):\n    for param in optim.state.values():\n        # Not sure there are any global tensors in the state dict\n        if isinstance(param, torch.Tensor):\n            param.data = param.data.to(device)\n            if param._grad is not None:\n                param._grad.data = param._grad.data.to(device)\n        elif isinstance(param, dict):\n            for subparam in param.values():\n                if isinstance(subparam, torch.Tensor):\n                    subparam.data = subparam.data.to(device)\n                    if subparam._grad is not None:\n                        subparam._grad.data = subparam._grad.data.to(device)\n\n\nclass LoadCheckPointOp(Operation):\n    \"\"\"An operation which loads a checkpoint during training of an\n    OpenAlphaTensor model.\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._last_epoch = None\n        self._model = None\n        self._optimizer = None\n\n    def execute(\n        self,\n        model: AlphaTensorModel,\n        optimizer: torch.optim.Optimizer,\n        checkpoint_dir: str,\n    ):\n        \"\"\"Load a checkpoint from a directory.\n\n        Args:\n            model: The model to load the checkpoint into.\n            optimizer: The optimizer to load the checkpoint into.\n            checkpoint_dir: The directory to load the checkpoint from.\n        \"\"\"\n        checkpoint_dir = checkpoint_dir or BASE_CHECKPOINT_DIR\n        if (\n            Path(checkpoint_dir).exists()\n            and len(list(Path(checkpoint_dir).glob(\"*.pt\"))) > 0\n        ):\n\n          
  def key_func(x):\n                return int(x.stem.split(\"_\")[-1])\n\n            checkpoint_path = sorted(\n                Path(checkpoint_dir).glob(\"*.pt\"), key=key_func\n            )[-1]\n            print(f\"Loading checkpoint from {checkpoint_path}\")\n            old_device = model.device\n            checkpoint = torch.load(checkpoint_path)\n            model.load_state_dict(checkpoint[\"model_state_dict\"])\n            model.to(old_device)\n            print(f\"Loaded model to {old_device}\")\n            optimizer.load_state_dict(checkpoint[\"optimizer_state_dict\"])\n            optimizer_to(optimizer, old_device)\n            last_epoch = int(checkpoint_path.stem.split(\"_\")[-1])\n        else:\n            last_epoch = 0\n\n        self._last_epoch = last_epoch\n        self._model = model\n        self._optimizer = optimizer\n\n    def get_last_epoch(self) -> int:\n        \"\"\"Returns the last epoch of the loaded checkpoint.\"\"\"\n        return self._last_epoch\n\n    def get_model(self) -> AlphaTensorModel:\n        \"\"\"Returns the model loaded from the checkpoint.\"\"\"\n        return self._model\n\n    def get_optimizer(self) -> torch.optim.Optimizer:\n        \"\"\"Returns the optimizer loaded from the checkpoint.\"\"\"\n        return self._optimizer\n\n    def get_result(self) -> Any:\n        pass\n\n\nclass LoadCheckpointDataOp(Operation):\n    \"\"\"An operation which loads the games played while training an\n    OpenAlphaTensor model.\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._loaded = False\n\n    def execute(self, games_store_dir: Path, trainer: Trainer):\n        \"\"\"Load the games played while training an OpenAlphaTensor model.\n\n        Args:\n            games_store_dir: The directory where the games are stored.\n            trainer: The trainer to load the games into.\n        \"\"\"\n        games_store_dir = games_store_dir or BASE_CHECKPOINT_DATA_DIR\n        # if games_store_dir 
contains games, load them\n        if (\n            games_store_dir.exists()\n            and (games_store_dir / \"game_data.json\").exists()\n        ):\n            trainer.dataset.load_games(games_store_dir)\n        self._loaded = True\n\n    def get_result(self) -> bool:\n        \"\"\"Returns whether the games were loaded or not.\"\"\"\n        return self._loaded\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/operations/model_op.py",
    "content": "import json\nfrom pathlib import Path\nfrom typing import Any\n\nimport torch\nfrom nebullvm.operations.base import Operation\n\nfrom open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel\n\n\nclass BuildModelOp(Operation):\n    \"\"\"An operation which builds an OpenAlphaTensor model.\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._model = None\n\n    def execute(\n        self,\n        tensor_length: int,\n        input_size: int,\n        scalars_size: int,\n        emb_dim: int,\n        n_steps: int,\n        n_logits: int,\n        n_samples: int,\n    ):\n        \"\"\"Builds the OpenAlphaTensor model.\n\n        Args:\n            tensor_length (int): Number of tensors to as history.\n            input_size (int): Flattened size of the matrices to be multiplied.\n            scalars_size (int): Size of the scalar vectors fed to the torso\n            model.\n            emb_dim (int): Embedding dimension.\n            n_steps (int): Number of steps used to get a single action out of\n            a triplet.\n            n_logits (int): Number of logits output by the policy head.\n            n_samples (int): Number of samples used by the policy head at\n            evaluation time.\n        \"\"\"\n        self._model = AlphaTensorModel(\n            tensor_length=tensor_length,\n            input_size=input_size,\n            scalars_size=scalars_size,\n            emb_dim=emb_dim,\n            n_steps=n_steps,\n            n_logits=n_logits,\n            n_samples=n_samples,\n        )\n\n    def get_model(self) -> AlphaTensorModel:\n        \"\"\"Returns the built model.\"\"\"\n        return self._model\n\n    def get_result(self) -> Any:\n        pass\n\n\nclass BuildOptimizerOp(Operation):\n    \"\"\"An operation which builds an optimizer for an OpenAlphaTensor model.\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._optimizer = None\n\n    def execute(\n        
self,\n        optimizer_name: str,\n        model: AlphaTensorModel,\n        lr: float,\n        weight_decay: float,\n    ):\n        \"\"\"Builds the optimizer for the OpenAlphaTensor model.\n\n        Args:\n            optimizer_name (str): Name of the optimizer used.\n            model (AlphaTensorModel): OpenAlphaTensor model to be trained.\n            lr (float): Learning rate.\n            weight_decay (float): Weight decay used by the optimizer.\n        \"\"\"\n        if optimizer_name == \"adam\":\n            optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n        elif optimizer_name == \"adamw\":\n            optimizer = torch.optim.AdamW(\n                model.parameters(), lr=lr, weight_decay=weight_decay\n            )\n        elif optimizer_name == \"sgd\":\n            optimizer = torch.optim.SGD(model.parameters(), lr=lr)\n        else:\n            raise ValueError(f\"Optimizer {optimizer_name} not supported\")\n        self._optimizer = optimizer\n\n    def get_optimizer(self) -> torch.optim.Optimizer:\n        \"\"\"Returns the built optimizer.\"\"\"\n        return self._optimizer\n\n    def get_result(self) -> Any:\n        pass\n\n\nclass SaveModelOp(Operation):\n    \"\"\"An operation which saves an OpenAlphaTensor model.\n    The model parameters are stored in a json file, while the model weights\n    are stored in a .pt file.\"\"\"\n\n    def get_result(self) -> Any:\n        pass\n\n    def execute(\n        self,\n        model: AlphaTensorModel,\n        save_dir: str,\n    ):\n        \"\"\"Saves the OpenAlphaTensor model.\n\n        Args:\n            model (AlphaTensorModel): OpenAlphaTensor model to be saved.\n            save_dir (str): Directory where the model will be saved.\n        \"\"\"\n        save_dir = Path(save_dir if save_dir else \".\")\n        save_dir.mkdir(parents=True, exist_ok=True)\n        torch.save(model.state_dict(), save_dir / \"final_model.pt\")\n        model_params = {\n            
\"input_size\": model.input_size,\n            \"tensor_length\": model.tensor_length,\n            \"scalars_size\": 1,\n            \"emb_dim\": model.emb_dim,\n            \"n_steps\": model.n_steps,\n            \"n_logits\": model.n_logits,\n            \"n_samples\": model.n_samples,\n        }\n        # save parameters in a json file\n        with open(save_dir / \"model_params.json\", \"w\") as f:\n            json.dump(model_params, f)\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/operations/training_op.py",
    "content": "from pathlib import Path\nfrom typing import Tuple, Any, List\n\nimport torch.optim\nfrom nebullvm.operations.base import Operation\n\nfrom open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel\nfrom open_alpha_tensor.core.training import Trainer\nfrom open_alpha_tensor.operations.checkpoint_op import LoadCheckpointDataOp\n\n\nclass TrainingOperation(Operation):\n    \"\"\"Operation which trains an AlphaTensor model to learn more efficient\n    matrix multiplications.\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._trained_model = None\n\n        self._load_checkpoint_data_op = LoadCheckpointDataOp()\n\n    def execute(\n        self,\n        model: AlphaTensorModel,\n        input_size: int,\n        n_steps: int,\n        batch_size: int,\n        optimizer: torch.optim.Optimizer,\n        device: str,\n        len_data: int,\n        pct_synth: float,\n        n_synth_data: int,\n        limit_rank: int,\n        max_epochs: int,\n        n_actors: int,\n        mc_n_sim: int,\n        N_bar: int,\n        last_epoch: int,\n        lr: float,\n        lr_decay_factor: float,\n        lr_decay_steps: int,\n        loss_params: Tuple[float, float] = None,\n        random_seed: int = None,\n        checkpoint_dir: str = None,\n        checkpoint_data_dir: str = None,\n        n_cob: int = 0,\n        cob_prob: float = 0.0,\n        data_augmentation: bool = False,\n        extra_devices: List[str] = None,\n    ):\n        \"\"\"Trains an AlphaTensor model to learn more efficient matrix\n        multiplications.\n\n        Args:\n            model (AlphaTensorModel): The model to be trained.\n            input_size (int): Flattened size of the matrices to be multiplied.\n            n_steps (int): Number of steps used to get a single action out of\n            a triplet.\n            batch_size (int): Batch size.\n            optimizer (torch.optim.Optimizer): The optimizer used for training.\n            
device (str): The name of the torch device used for training.\n            len_data (int): Number of training samples used (both actor\n            generated and synthetic).\n            pct_synth (float): Initial percentage of synthetic samples used\n            for training.\n            n_synth_data (int): Number of synthetic training samples.\n            limit_rank (int): Maximum rank for synthetically-generated\n            matrices.\n            max_epochs (int): Number of training epochs.\n            n_actors (int): Number of actors to play a single each game at\n            each training step.\n            mc_n_sim (int): Number of simulations during Monte Carlo tree\n            search.\n            N_bar (int): N_bar parameter used to compute tau when improving\n            the policy.\n            last_epoch (int): Latest epoch reached during training from which\n            checkpoint data will be loaded.\n            lr (float): Learning rate.\n            lr_decay_factor (float): Learning rate's decay factor.\n            lr_decay_steps (int): Number of learning rate's decay steps.\n            loss_params (Tuple[float, float]): Alpha and Beta parameters used\n            in the loss function.\n            random_seed (int): Randomizing seed.\n            checkpoint_dir (str): Directory used to store model checkpoints.\n            checkpoint_data_dir (str): Directory used to store games as JSON\n            files.\n            n_cob (int): Number of change of basis (cob) used for a single\n            training sample.\n            cob_prob (float): Probability of applying a change of basis.\n            data_augmentation (bool): Whether to randomly swap the last\n            operation of an episode with another operation.\n            extra_devices (List[str]): Extra devices names used for multi-GPU\n            training.\n        \"\"\"\n        checkpoint_data_dir = Path(checkpoint_data_dir or \"games\")\n        # build trainer\n        trainer 
= Trainer(\n            model=model,\n            tensor_size=input_size,\n            n_steps=n_steps,\n            batch_size=batch_size,\n            optimizer=optimizer,\n            device=device,\n            len_data=len_data,\n            pct_synth=pct_synth,\n            n_synth_data=n_synth_data,\n            limit_rank=limit_rank,\n            loss_params=loss_params,\n            random_seed=random_seed,\n            checkpoint_dir=checkpoint_dir,\n            checkpoint_data_dir=checkpoint_data_dir,\n            data_augmentation=data_augmentation,\n            cob_prob=cob_prob,\n            n_cob=n_cob,\n            extra_devices=extra_devices,\n        )\n\n        # load checkpoint data\n        self._load_checkpoint_data_op.execute(\n            games_store_dir=checkpoint_data_dir,\n            trainer=trainer,\n        )\n\n        # train\n        trainer.train(\n            n_epochs=max_epochs,\n            n_games=n_actors,\n            mc_n_sim=mc_n_sim,\n            N_bar=N_bar,\n            starting_epoch=last_epoch,\n            initial_lr=lr,\n            lr_decay_factor=lr_decay_factor,\n            lr_decay_steps=lr_decay_steps,\n        )\n        self._trained_model = trainer.model\n\n    def get_trained_model(self):\n        \"\"\"Returns the trained model.\"\"\"\n        return self._trained_model\n\n    def get_result(self) -> Any:\n        pass\n"
  },
  {
    "path": "optimization/open_alpha_tensor/open_alpha_tensor/root_op.py",
    "content": "from typing import Tuple, List\n\nfrom nebullvm.operations.base import Operation\n\nfrom open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel\nfrom open_alpha_tensor.operations.checkpoint_op import LoadCheckPointOp\nfrom open_alpha_tensor.operations.model_op import (\n    BuildModelOp,\n    SaveModelOp,\n    BuildOptimizerOp,\n)\nfrom open_alpha_tensor.operations.training_op import TrainingOperation\n\n\nclass TrainAlphaTensorRootOp(Operation):\n    \"\"\"Root operation which trains an AlphaTensor model to learn more\n    efficient matrix multiplications.\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._model = None\n        self._optimizer = None\n\n        self._build_model_op = BuildModelOp()\n        self._build_optimizer_op = BuildOptimizerOp()\n        self._load_checkpoint_op = LoadCheckPointOp()\n        self._training_op = TrainingOperation()\n        self._save_model_op = SaveModelOp()\n\n    def execute(\n        self,\n        tensor_length: int,\n        input_size: int,\n        scalars_size: int,\n        emb_dim: int,\n        n_steps: int,\n        n_logits: int,\n        n_samples: int,\n        optimizer_name: str,\n        lr: float,\n        lr_decay_factor: float,\n        lr_decay_steps: int,\n        weight_decay: float,\n        loss_params: Tuple[float, float],\n        checkpoint_dir: str,\n        checkpoint_data_dir: str,\n        epochs: int,\n        batch_size: int,\n        len_data: int,\n        n_synth_data: int,\n        pct_synth: float,\n        limit_rank: int,\n        n_actors: int,\n        mc_n_sim: int,\n        N_bar: int,\n        device: str,\n        save_dir: str,\n        random_seed: int,\n        n_cob: int,\n        cob_prob: float,\n        data_augmentation: bool,\n        extra_devices: List[str],\n    ):\n        \"\"\"Trains an AlphaTensor model to learn more efficient matrix\n        multiplications.\n\n        Args:\n            tensor_length 
(int): Number of step tensors fed to the model\n            (history and current state),\n            input_size (int): Flattened size of the matrices to be multiplied,\n            scalars_size (int): Size of the scalar vectors fed to the torso\n            model,\n            emb_dim (int): Embedding dimension,\n            n_steps (int): Number of steps used to get a single action out of\n            a triplet,\n            n_logits (int): Number of logits output by the policy head,\n            n_samples (int): Number of samples used by the policy head at\n            evaluation time,\n            optimizer_name (str): Name of the optimizer used,\n            lr (float): Learning rate,\n            lr_decay_factor (float): Learning rate's decay factor,\n            lr_decay_steps (int): Number of learning rate's decay steps,\n            weight_decay (float): Weight decay used by the optimizer,\n            loss_params (Tuple[float, float]): Alpha and Beta parameters used\n            in the loss function,\n            checkpoint_dir (str): Directory used to store model checkpoints,\n            checkpoint_data_dir (str): Directory used to store games as JSON\n            files,\n            epochs (int): Number of training epochs,\n            batch_size (int): Batch size,\n            len_data (int): Number of training samples used (both actor\n            generated and synthetic),\n            n_synth_data (int): Number of synthetic training samples,\n            pct_synth (float): Initial percentage of synthetic samples used\n            for training,\n            limit_rank (int): Maximum rank for synthetically-generated\n            matrices,\n            n_actors (int): Number of actors to play a single each game at\n            each training step,\n            mc_n_sim (int): Number of simulations during Monte Carlo tree\n            search,\n            N_bar (int): N_bar parameter used to compute tau when improving\n            the policy,\n           
 device (str): The name of the torch device used for training,\n            save_dir (str): Directory where the final trained model will be\n            stored,\n            random_seed (int): Randomizing seed,\n            n_cob (int): Number of change of basis (cob) used for a single\n            training sample,\n            cob_prob (float): Probability of applying a change of basis,\n            data_augmentation (bool): Whether to randomly swap the last\n            operation of an episode with another operation,\n            extra_devices (List[str]): Extra devices names used for multi-GPU\n            training.\n        \"\"\"\n        if self._model is None:\n            self._build_model_op.execute(\n                tensor_length=tensor_length,\n                input_size=input_size,\n                scalars_size=scalars_size,\n                emb_dim=emb_dim,\n                n_steps=n_steps,\n                n_logits=n_logits,\n                n_samples=n_samples,\n            )\n            self._model = self._build_model_op.get_model().to(device)\n\n        if self._build_model_op.get_model() is not None:\n            self._build_optimizer_op.execute(\n                optimizer_name=optimizer_name,\n                model=self._build_model_op.get_model(),\n                lr=lr,\n                weight_decay=weight_decay,\n            )\n            self._optimizer = self._build_optimizer_op.get_optimizer()\n\n        if self._model is not None and self._optimizer is not None:\n            self._load_checkpoint_op.execute(\n                self._model, self._optimizer, checkpoint_dir\n            )\n\n        if self._load_checkpoint_op.get_model() is not None:\n            self._model = self._load_checkpoint_op.get_model()\n            self._optimizer = self._load_checkpoint_op.get_optimizer()\n            starting_epoch = self._load_checkpoint_op.get_last_epoch()\n            self._training_op.execute(\n                model=self._model,\n            
    input_size=input_size,\n                n_steps=n_steps,\n                batch_size=batch_size,\n                optimizer=self._optimizer,\n                device=device,\n                len_data=len_data,\n                pct_synth=pct_synth,\n                n_synth_data=n_synth_data,\n                limit_rank=limit_rank,\n                max_epochs=epochs,\n                n_actors=n_actors,\n                mc_n_sim=mc_n_sim,\n                N_bar=N_bar,\n                last_epoch=starting_epoch,\n                lr=lr,\n                lr_decay_factor=lr_decay_factor,\n                lr_decay_steps=lr_decay_steps,\n                loss_params=loss_params,\n                random_seed=random_seed,\n                checkpoint_dir=checkpoint_dir,\n                checkpoint_data_dir=checkpoint_data_dir,\n                n_cob=n_cob,\n                cob_prob=cob_prob,\n                data_augmentation=data_augmentation,\n                extra_devices=extra_devices,\n            )\n        if self._training_op.get_trained_model() is not None:\n            self._model = self._training_op.get_trained_model()\n            self._save_model_op.execute(\n                model=self._model,\n                save_dir=save_dir,\n            )\n\n    def get_result(self) -> AlphaTensorModel:\n        \"\"\"Returns the trained torch model\"\"\"\n        return self._model\n"
  },
  {
    "path": "optimization/open_alpha_tensor/resources/open_alpha_tensor.md",
    "content": "\n# Open Source Implementation of DeepMind’s AlphaTensor\n\n\n\nMatrix multiplication is a fundamental operation used in many systems, from neural networks to scientific computing routines. Finding efficient and provably correct algorithms for matrix multiplication can have a huge impact on making computation faster and more efficient, but is a very challenging task. The space of possible algorithms is enormous, and traditional methods for discovering algorithms, such as human-designed heuristics or combinatorial search, are often suboptimal.\n\n[DeepMind](https://www.deepmind.com/)'s recently proposed an AI-based solution for automated search that goes far beyond human intuition. The solution consists of a deep reinforcement learning agent called AlphaTensor, built on top of [AlphaZero](https://www.deepmind.com/blog/alphazero-shedding-new-light-on-chess-shogi-and-go). This agent is trained to play a single-player game, TensorGame, where the goal is to discover computationally efficient algorithms for matrix multiplication.\n\nAlphaTensor is particularly good at handling large matrices by decomposing large matrix multiplications into smaller multiplications. Moreover, AlphaTensor can be used to achieve state-of-the-art performance for matrix multiplication once fine-tuned on a specific hardware device.\n\nAlphaTensor has great potential for accelerating deep learning computing. In deep learning, many time-consuming operations can be mapped to matrix multiplications. By using AlphaTensor to optimize these operations, the overall performance of deep learning models can be significantly improved. \n\nIn this article, we will explore DeepMind's AlphaTensor architecture and algorithm and how it discovers new efficient algorithms by playing the TensorGame. 
Next, we will examine the [first open-source implementation of AlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor), and the challenges that remain unresolved before AlphaTensor can potentially revolutionize the computational performance of deep learning models.\n\n![deepmind-4QVqSh4VvP4-unsplash](https://user-images.githubusercontent.com/83510798/221407730-77526b8f-b363-4716-9945-6ccd518632e5.jpg)\n\nPhoto by [DeepMind](https://unsplash.com/@deepmind?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText) on [Unsplash](https://unsplash.com/photos/4QVqSh4VvP4)\n\n# What is DeepMind’s AlphaTensor?\n\nAlphaTensor is a reinforcement learning algorithm based on the AlphaZero algorithm and trained to play a simple one-player game: the TensorGame. This game consists in finding the tensor decomposition of a three-dimensional tensor representing the matrix multiplication.\n\n### Matrix Multiplication Tensor\n\nFor non-experts in Matrix Multiplication optimization, it may not be straightforward to understand how an operation, such as a matrix multiplication, can be mapped to a three-dimensional tensor. I will try to explain it in simple words and with examples.\n\nLet’s consider the product `C = A*B`, where for simplicity both A and B are square matrices of size N. The multiplication operation can be mapped to a 3D tensor of shape `(N^2, N^2, N^2)`. The first tensor dimension represents the flattened matrix A, the second dimension the flattened matrix B and the third dimension the flattened matrix C.\n\nThe tensor has only binary values (either 1 or 0) for each entry. Note that the tensor represents the multiplication operation, so it is independent of the values of the matrices A and B.\n\nEvery entry of the tensor corresponds to the coefficient of the operation. For example, to compute C[1,1], it is necessary to multiply both A[1,1] and B[1,1]. 
Therefore, the tensor entry [0,0,0], which corresponds to A[1,1], B[1,1] and C[1,1], will have value 1. In contrast, to compute C[1,1], A[2,1] is not needed. Thus, the tensor row T[N, :, 0] will contain only zeros.\n\nThe image below from [DeepMind’s paper](https://www.nature.com/articles/s41586-022-05172-4) shows an example of a tensor for N=2.\n\n<img width=\"972\" alt=\"Screen Shot 2023-02-26 at 12 33 26 PM\" src=\"https://user-images.githubusercontent.com/83510798/221408016-9228ec6e-1cd6-44f7-a34c-45ad293989fe.png\">\n\nAs shown in (b) and (c) in the figure above, it is possible to implement an algorithm for computing the product using a decomposition of the 3D tensor. More specifically, the algorithm below can be used for converting a tensor decomposition (the matrices U, V, W) into a matrix multiplication algorithm.\n\n<img width=\"637\" alt=\"Screen Shot 2023-02-26 at 1 36 10 PM\" src=\"https://user-images.githubusercontent.com/83510798/221410847-74a7a115-4de6-42d6-9969-51124c2e986b.png\">\n\n## The TensorGame\n\nThe problem of finding efficient algorithms for matrix multiplication is extremely challenging because the number of possible algorithms to consider is much larger than the number of atoms in the universe, even for small instances of matrix multiplication. \n\nDeepMind converted this problem into a single-player game, and called it the TensorGame. In this game, the player chooses how to combine different entries of matrices to multiply them. A score is assigned based on the number of operations required to achieve the correct multiplication result. The game ends when the zero tensor is reached or when the maximum number of moves has been made. 
The final factorization is evaluated based on an estimation of the residual rank and certain optimization criteria, such as asymptotic time complexity or practical runtime.\n\nThe initial position in the TensorGame corresponds to the Matrix Multiplication Tensor expressed on some random basis.\n\nIn each step t of the game, the player writes down three vectors $\\vec{u}(t), \\vec{v}(t), \\vec{w}(t)$, which specify the rank-1 tensor $\\vec{u} \\otimes \\vec{v} \\otimes \\vec{w}$. The state of the game is updated by subtracting the rank-1 tensor built from the vectors selected by the player:\n\n$$\n\\tilde{S}_{t+1} = \\tilde{S}_{t} - \\vec{u} \\otimes \\vec{v} \\otimes \\vec{w}\n$$\n\nwhere $\\tilde{S}_0$ is the Matrix Multiplication Tensor.\n\nIf the game ends in p steps, this means that the Matrix Multiplication Tensor $\\tilde S_0$ can be decomposed into p rank-1 tensors $\\vec{u} \\otimes \\vec{v} \\otimes \\vec{w}$, i.e. its rank is at most p.\n\nThe TensorGame can then be interpreted as a rank decomposition algorithm and AlphaTensor can be seen as an algorithm for estimating the rank of the tensor.\n\n## AlphaTensor Architecture\n\nSo far we have learned about the TensorGame and clarified how its solution can be seen as a matrix multiplication algorithm. Let’s now explore the main concepts of AlphaTensor, the algorithm used for the game.\n\nAlphaTensor architecture is basically an encoder-decoder Transformer architecture where: \n\n- the encoder takes as input the game state $\\tilde S_t$, the n previous actions taken by the model (usually n=7) and the time index t of the current action. Information is stacked together in a tensor with shape `(n+1, N^2, N^2, N^2)`. This tensor is then reshaped and transformed (using three linear layers) into a tensor of shape `(N^2, N^2, c)` where c is the inner dimension of the model.\n- the decoder generates the `n_steps` actions from the embedded vector given by the encoder in an auto-regressive way.  
Each action corresponds to a token of the triplets $(\\vec{u}, \\vec{v}, \\vec{w})$ representing one of the triplets decomposing the game tensor (i.e. reducing its rank)\n\nThe model is trained by alternating back-propagation and model acting. Model acting is used to generate data that is then used to train the model. In practice, the model is trained with a mixture of synthetically generated data and data generated by the model during acting. The acting step is done by taking a 3D tensor corresponding to a matrix operation and playing `n_actors` games on it. Each actor plays a game either on the standard basis or on an alternative basis (the change of basis is applied with a given probability). The results are then collected and can be used in the training step with the synthetic data.\n\nThe acting step is based on AlphaZero's Monte Carlo Tree Search (MCTS), modified to support large action spaces. In short, before choosing the action, `n_sims` paths are explored from the model output with a maximum future exploration of 5 steps. The probabilities generated by the model are then adjusted taking into account the generated paths. Then the action with the most promising future path(s) is chosen to continue the game.\n\nWhile training the model, the reward is actually a negative reward (penalty). Its absolute value increases with each additional step required to solve the game. If the model takes `m` steps to solve a TensorGame, the reward associated with the game is `r=-m.` If the model is not able to solve the TensorGame in `max_rank` steps, the reward is computed by estimating the rank of the remaining tensor. The rank is estimated as the sum of the ranks of the matrices that compose the tensor. The estimate is an upper bound on the true rank of the tensor.\n\nWhen fine-tuning the model, the penalty reward at the terminal state should also take into account the latency of the algorithm produced by the model.  
The reward formula becomes `rt'=rt+λbt`, where `rt` is the reward scheme described earlier, `bt` is the benchmark reward (non-zero only at the terminal state), and *`λ`* is a user-specified coefficient.\n\n<img width=\"1347\" alt=\"Screen Shot 2023-02-26 at 1 37 12 PM\" src=\"https://user-images.githubusercontent.com/83510798/221410915-7c57c029-e181-4030-8fb3-f4bd544f6beb.png\">\n\nThe image above from DeepMind's paper shows the speed-ups (%) of AlphaTensor-discovered algorithms tailored for a GPU and a TPU, extracted from DeepMind’s paper. Speed-ups are measured relative to standard (e.g. cuBLAS for the GPU) matrix multiplication on the same hardware and compared to the Strassen-square algorithm.\n\n# The Open Source Implementation of DeepMind’s AlphaTensor\n\n[OpenAlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor) is the first open source implementation of AlphaTensor and was developed by [Diego Fiori](https://www.linkedin.com/in/diego-fiori-/). \n\nLet's discover more about the implementation.\n\nAs we discussed earlier, the AlphaTensor architecture is fairly straightforward, based on a standard transformer with an encoder-decoder architecture. 
The most interesting components of AlphaTensor are the first layer in the encoder part and the way the actions are sampled.\n\nLet’s start with the first encoding layer.\n\n```python\n# x.size = (N, T, S, S, S)\n# scalars.size = (N, s)\nbatch_size = x.shape[0]\nS = x.shape[-1]\nT = x.shape[1]\nx1 = x.permute(0, 2, 3, 4, 1).reshape(batch_size, S, S, S * T)\nx2 = x.permute(0, 4, 2, 3, 1).reshape(batch_size, S, S, S * T)\nx3 = x.permute(0, 3, 4, 2, 1).reshape(batch_size, S, S, S * T)\ninput_list = [x1, x2, x3]\nfor i in range(3):\n    temp = self.linears_1[i](scalars).reshape(batch_size, S, S, 1)\n    input_list[i] = torch.cat([input_list[i], temp], dim=-1)\n    input_list[i] = self.linears_2[i](input_list[i])\nx1, x2, x3 = input_list\n```\n\nIn the snippet above, we show how the input tensor is decomposed into three tensors, which are then used as query, key and value inputs of the transformer-layer.\n\n1. Across the three tensor dimensions representing the flattened matrices (A, B, C), the input tensor is flattened along each dimension together with the dimension representing the previous actions. In this way, in each flattened-copy of the input tensor, the selected dimension is an aggregation of the last T-1 values and the actual value, for all the S values of the selected dimension, where S=N^2. Philosophically, it is as if, for each dimension, we focus on what happened in the previous actions in that dimension.\n2. The scalars are mapped in three different spaces of dimension S^2, and then reshaped to be concatenated with the tensors obtained at the previous point. Conceptually, the scalars are mapped to an embedding space of dimension S^2, and then the embedded information is chunked into S vectors and stacked together, similar to what happens to text when tokenized.\n3. 
Scalar tokens are concatenated with the restructured input tensor and then given as input to a linear layer for mapping the scalars+channel-history focus information into the internal dimension of the model.\n\nThese three steps can be interpreted as a way of giving to the model both information about the scalars (as in the TensorGame time step) and the focus on the previous actions for each channel.\n\nRegarding the way the actions are produced, it is interesting to note that AlphaTensor generates as output the triplet u, v, w, which aims to reduce the tensor rank. The three vectors have size S and since they are concatenated the model has to produce a vector of size 3*S.  AlphaTensor is trained with an RL algorithm, so all possible actions must be expressed in terms of probabilities in an enumerated space, i.e. the model produces a probability over the different actions. This means that each vector in the 3S space should be mapped to a different action. This results in an action space of size |F|^(3S), where |F| is the number of different values that the elements of u, v, w can take. Usually the values are restricted to (-2, -1, 0, 1, 2), resulting in a cardinality of 5 elements.\n\nHere comes a major challenge: to generate the action probabilities for a matrix product of matrices of size 5 we would need a memory of 5^75 * 4 bytes, which would mean `~10^44 GB` of memory. Clearly we cannot manage such a large action space. \n\nHow do we solve the problem? To reduce the memory footprint of the action probabilities we can split the triplets into smaller chunks, “tokenize” them, and treat the chunks as generated tokens in the transformer architecture, i.e. the tokens are given as input to the decoder in an auto-regressive way.  In the example above we can split the triplets into 15 chunks, reducing the memory consumption to `15 * 5^(75/15) * 4`, i.e. 
`187.5 KB`.\n\n```python\ndef _eval_forward(self, e: torch.Tensor):\n    bs = e.shape[0]\n    future_g = (\n        torch.zeros((bs, self.n_samples, self.n_steps)).long().to(e.device)\n    )\n    ps = torch.ones((bs, self.n_samples)).to(e.device)\n    e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1)\n\n    future_g = future_g.view(-1, self.n_steps)\n    ps = ps.view(-1)\n    e = e.view(-1, e.shape[-2], e.shape[-1])\n    for i in range(self.n_steps):\n        o_s, z_s = self.core(future_g[:, : i + 1], e)\n        future_g[:, i], p_i = sample_from_logits(o_s[:, i])\n        ps *= p_i\n    future_g = future_g.view(bs, self.n_samples, self.n_steps)\n    ps = ps.view(bs, self.n_samples)\n    return (\n        future_g,\n        ps,\n        z_s[:, 0].view(bs, self.n_samples, *z_s.shape[2:]).mean(1),\n    )\n\n```\n\nAbove we show the code snippet for generating the full action. In the code, `self.core` contains the decoder layer and the tensor `e` represents the output of the encoder layer. Zero can be considered as the `<eos>` token in NLP models and the `n_steps` actions representing the `n_steps` chunks are generated in a progressive way. \n\nThe model returns three quantities:\n\n1. The generated actions\n2. The probability associated with the full action\n3. The logits produced for generating the first action (the first chunk) that will be used for computing the model value.\n\nIt is worth spending a few words on the `n_samples` parameter. The parameter is used for the acting step and it allows the model to generate different versions of the triplets which will then be used for exploring the action space in the Monte Carlo Tree Search algorithm used in the Acting process. The `n_samples` different actions are sampled according to the policy generated by the model.\n\n## Acting Step\n\nThe trickiest part of the whole algorithm is probably the Acting step used for solving the TensorGame. 
The algorithm is not deeply explained in the AlphaTensor paper, since it is based on several of DeepMind’s previous papers which are just cited and given as known. Here, I’ll re-compose all the missing pieces and explain our implementation step by step. \n\nWe can organize the acting step into three different components:\n\n- The Monte-Carlo Tree Search\n- The game simulation\n- The Improved policy computation\n\n### Monte-Carlo Tree Search (MCTS)\n\nMonte Carlo Tree Search (MCTS) is a widely used artificial intelligence technique for game playing, particularly in board games and video games. The algorithm creates a game tree that simulates potential moves and outcomes and uses random sampling to evaluate the expected reward for each move. The algorithm then repeatedly selects the move with the highest expected reward and continues simulating outcomes until it reaches a terminal state or a specified stopping condition. The simulations are used to estimate the probability of winning for each move and guide the decision-making process. MCTS has been shown to be effective in complex games where the number of possible moves and outcomes is large, and it has been used in successful game-playing AI systems, such as AlphaGo.\n\nIn AlphaTensor a modified version of the original MCTS is used. In particular, instead of randomly selecting the action from the whole action space, the action is selected among a subset generated directly by the model (through the `n_samples` presented before). The correction to the policy update is then applied in the **Improved Policy computation** step.\n\nIn our implementation, we decided to keep all the information about the Monte-Carlo tree in a dictionary having as key the hash-version of the TensorGame state and as values the information associated with the state itself. Each Monte-Carlo step starts from a node and simulates `n_sim` mini-games, exploring the future with a horizon of 5 moves. 
If the node has already been explored in previous simulations, n_sim is adjusted considering the number of previous exploration. For each node the number of visits is stored in the `N_s_a` tensor, since this tensor contains the number of visits per node child action (among the ones sampled by the model).\n\n```python\ndef monte_carlo_tree_search(\n    model: torch.nn.Module,\n    state: torch.Tensor,\n    n_sim: int,\n    t_time: int,\n    n_steps: int,\n    game_tree: Dict,\n    state_dict: Dict,\n):\n\"\"\"Runs the monte carlo tree search algorithm.\n\n    Args:\n        model (torch.nn.Module): The model to use for the simulation.\n        state (torch.Tensor): The initial state.\n        n_sim (int): The number of simulations to run.\n        t_time (int): The current time step.\n        n_steps (int): The maximum number of steps to simulate.\n        game_tree (Dict): The game tree.\n        state_dict (Dict): The dictionary containing the states.\n    \"\"\"\n    state_hash = to_hash(extract_present_state(state))\n    if state_hash in state_dict:\n        with torch.no_grad():\n            N_s_a = state_dict[state_hash][3]\n            n_sim -= int(N_s_a.sum())\n            n_sim = max(n_sim, 0)\n\n    for _ in range(n_sim):\n        simulate_game(model, state, t_time, n_steps, game_tree, state_dict)\n    # return next state\n    possible_states_dict, _, repetitions, N_s_a, q_values, _ = state_dict[\n        state_hash\n    ]\n    possible_states = _recompose_possible_states(possible_states_dict)\n    next_state_idx = select_future_state(\n        possible_states, q_values, N_s_a, repetitions, return_idx=True\n    )\n    next_state = possible_states[next_state_idx]\n    return next_state\n\n```\n\nThe code above shows our implementation of the algorithm. For a matter of code simplicity the policy correction is performed in the `simulate_game` function. 
\n\n### Game Simulation\n\nThe `simulate_game` function is responsible for exploring the tree composed of nodes representing a particular state of the TensorGame. It also runs the model whenever a leaf node is encountered and it stores all node information in the `state_dict` dictionary. Let’s give a deep look at its implementation:\n\n```python\n@torch.no_grad()\ndef simulate_game(\n    model,\n    state: torch.Tensor,\n    t_time: int,\n    max_steps: int,\n    game_tree: Dict,\n    states_dict: Dict,\n    horizon: int = 5,\n):\n\"\"\"Simulates a game from a given state.\n\n  Args:\n      model: The model to use for the simulation.\n      state (torch.Tensor): The initial state.\n      t_time (int): The current time step.\n      max_steps (int): The maximum number of steps to simulate.\n      game_tree (Dict): The game tree.\n      states_dict (Dict): The states dictionary.\n      horizon (int): The horizon to use for the simulation.\n  \"\"\"\n\tidx = t_time\n  max_steps = min(max_steps, t_time + horizon)\n  state_hash = to_hash(extract_present_state(state))\n  trajectory = []\n  # selection\n  while state_hash in game_tree:\n      (\n          possible_states_dict,\n          old_idx_to_new_idx,\n          repetition_map,\n          N_s_a,\n          q_values,\n          actions,\n      ) = states_dict[state_hash]\n      possible_states = _recompose_possible_states(possible_states_dict)\n      state_idx = select_future_state(\n          possible_states, q_values, N_s_a, repetition_map, return_idx=True\n      )\n      trajectory.append((state_hash, state_idx))  # state_hash, action_idx\n      future_state = extract_present_state(possible_states[state_idx])\n      state = possible_states[state_idx]\n      state_hash = to_hash(future_state)\n      idx += 1\n\n  # expansion\n  if idx <= max_steps:\n      trajectory.append((state_hash, None))\n      if not game_is_finished(extract_present_state(state)):\n          state = state.to(model.device)\n          scalars = 
get_scalars(state, idx).to(state.device)\n          actions, probs, q_values = model(state, scalars)\n          (\n              possible_states,\n              cloned_idx_to_idx,\n              repetitions,\n              not_dupl_indexes,\n          ) = extract_children_states_from_actions(\n              state,\n              actions,\n          )\n          not_dupl_actions = actions[:, not_dupl_indexes].to(\"cpu\")\n          not_dupl_q_values = torch.zeros(not_dupl_actions.shape[:-1]).to(\n              \"cpu\"\n          )\n          N_s_a = torch.zeros_like(not_dupl_q_values).to(\"cpu\")\n          present_state = extract_present_state(state)\n          states_dict[to_hash(present_state)] = (\n              _reduce_memory_consumption_before_storing(possible_states),\n              cloned_idx_to_idx,\n              repetitions,\n              N_s_a,\n              not_dupl_q_values,\n              not_dupl_actions,\n          )\n          game_tree[to_hash(present_state)] = [\n              to_hash(extract_present_state(fut_state))\n              for fut_state in possible_states\n          ]\n          leaf_q_value = q_values\n  else:\n      leaf_q_value = -int(torch.linalg.matrix_rank(state).sum())\n  # backup\n  backward_pass(trajectory, states_dict, leaf_q_value=leaf_q_value)\n```\n\nEach simulation is divided in three parts:\n\n- Selection\n- Expansion\n- Backup\n\nIn the `selection` part the simulation is run on the already generated tree-nodes, and the following node is selected using the following function:\n\n```python\ndef select_future_state(\n    possible_states: List[torch.Tensor],\n    q_values: torch.Tensor,\n    N_s_a: torch.Tensor,\n    repetitions: Dict[int, list],\n    c_1: float = 1.25,\n    c_2: float = 19652,\n    return_idx: bool = False,\n) -> torch.Tensor:\n\"\"\"Select the future state maximizing the upper confidence bound.\"\"\"\n# q_values (1, K, 1)\n    pi = torch.tensor(\n        [\n            len(repetitions[i])\n            
for i in range(len(possible_states))\n            if i in repetitions\n        ]\n    ).to(q_values.device)\n    ucb = q_values.reshape(-1) + pi * torch.sqrt(\n        torch.sum(N_s_a) / (1 + N_s_a)\n    ) * (c_1 + torch.log((torch.sum(N_s_a) + c_2 + 1) / c_2))\n    if return_idx:\n        return ucb.argmax()\n    return possible_states[ucb.argmax()]\n```\n\nIn practice, the action maximizing the `ucb` function\n\n$$\nQ(a,s) + \\pi(a,s) * \\sqrt{\\frac{\\sum_i{N(s, a_i)}}{1+N(s,a)}} * \\left[c_1 + \\log\\left(\\frac{1+c_2+\\sum_i{N(s, a_i)}}{c_2}\\right)\\right]\n$$\n\nfor the given state is selected. Where Q represents the Q values generated by the model and π represents the random distribution over the actions sampled using the model policy. `N(s, a)` represents the number of visits of the node to action a from node s.\n\nOnce the selection phase reaches a leaf node, if the simulation has not reached a terminal condition (in terms of either maximum exploration, i.e. future horizon, or game ending), the model is then used for selecting `n_samples` alternative nodes (they will be leaf nodes in the successive iteration). This is called the `expansion` phase, since new nodes are added to the tree. Then, no further node is explored in the current simulation, but the leaf q_value is sent to the following simulation step: the `backup`.\n\nBackup is the final stage of each simulation. During backup, if the leaf node was a terminal state the final reward is computed else the leaf q value is used as an estimated reward. Then the reward is back-propagated on the simulation trajectory updating both the states q_values and updating the visit counter `N(s, a)`. 
In the snippet below we show the code for the reward back-propagation.\n\n```python\ndef backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor):\n\"\"\"Backward pass of the montecarlo algorithm\"\"\"\nreward = 0\n    for idx, (state, action_idx) in enumerate(reversed(trajectory)):\n        if action_idx is None:  # leaf node\n            reward += leaf_q_value\n        else:\n            (\n                _,\n                old_idx_to_new_idx,\n                _,\n                N_s_a,\n                q_values,\n                _,\n            ) = states_dict[state]\n            if isinstance(reward, torch.Tensor):\n                reward = reward.to(q_values.device)\n            action_idx = int(action_idx)\n            if action_idx in old_idx_to_new_idx:\n                not_dupl_index = old_idx_to_new_idx[int(action_idx)]\n            else:\n                not_dupl_index = action_idx\n            reward -= 1\n            q_values[:, not_dupl_index] = (\n                N_s_a[:, not_dupl_index] * q_values[:, not_dupl_index] + reward\n            ) / (N_s_a[:, not_dupl_index] + 1)\n            N_s_a[:, not_dupl_index] += 1\n```\n\n### Improved Policy Computation\n\nOnce all the simulations have been run and the MCTS offers an interesting snapshot of the near future it is time to update the policy associated with the predicted nodes and return them, so that they can be used during training. The improved policy, following the method described in [Hubert et al](https://arxiv.org/pdf/2104.06303.pdf), is used for managing large action spaces. In fact, for small search space it is possible during MCTS to sample an action randomly from the action space and evaluate its impact. A similar approach in a much larger action space would lead to all trajectories to diverge in different paths and it would need an infinite amount of trajectories for getting meaningful statistics and then update the policy. 
Since here we are using sample-MCTS for avoiding the dispersion, i.e. `n_samples` actions are sampled accordingly to the model policy and then MCTS just selects one of the sampled actions while exploring the tree, we need to take into account the sample-correction when computing the final updated policy that will be used while training the model.\n\nIn practice the improved policy is computed as\n\n$$\nI\\pi\\left(s, a\\right) = \\frac{N^{1/\\tau(s)}(s, a)}{\\sum_iN^{1/\\tau(s)}(s, a_i)}\n$$\n\nwhere $\\tau(s) = \\frac{\\log\\left(\\sum_iN(s, a_i)\\right)}{\\log\\left(\\bar{N}\\right)}$ if $\\sum_iN(s, a_i) > \\bar{N}$ else $\\tau(s) = 1$.\n\n```python\ndef compute_improved_policy(\n    state_dict: Dict,\n    states: List[str],\n    model_n_steps: int,\n    model_n_logits: int,\n    N_bar: int,\n):\n\t\t\"\"\"Compute the improved policy given the state_dict, the list of states.\n    The improved policy is computed as (N_s_a / N_s_a.sum())ˆ(1/tau) where tau\n    is (log(N_s_a.sum()) / log(N_bar)) if N_s_a.sum() > N_bar else 1.\n    \"\"\"\n\t\tpolicies = torch.zeros(len(states), model_n_steps, model_n_logits)\n    N_bar = torch.tensor(N_bar)\n    for idx, state in enumerate(states):\n        N_s_a = state_dict[state][3]\n        actions = state_dict[state][5]\n        if N_s_a.sum() > N_bar:\n            tau = (torch.log(N_s_a.sum()) / torch.log(N_bar)).item()\n        else:\n            tau = 1\n\t\t\t\tN_s_a = N_s_a ** (1 / tau)\n        improved_policy = N_s_a / N_s_a.sum()\n        for sample_id in range(actions.shape[1]):\n            action_ids = actions[0, sample_id]\n            for step_id, action_id in enumerate(action_ids):\n                policies[idx, step_id, action_id] += improved_policy[\n                    0, sample_id\n                ]\n    return policies\n```\n\nNote that in our implementation after having computed the policy from the `N_s_a` tensor we have to map it back to the original action tensor. 
In fact `N_s_a` just considers the actions sampled by the model, while the final policy must contain probabilities also for the not-explored actions.\n\n### Differences respect to ChatGPT training algorithm\n\nAlphaTensor is the latest member of the AlphaGo/AlphaZero family of artificial intelligence methods by DeepMind. These methods are based on the Monte Carlo Tree Search (MCTS) algorithm, which has been refined and enhanced by DeepMind to tackle increasingly complex tasks. Another AI system, OpenAI's ChatGPT, which has caused a lot of buzz for its remarkable performance, was trained with a different approach, called Reinforcement Learning from Human Feedback (RLHF).\n\nRLHF is a fine-tuning technique used to tune language models to follow a set of written instructions. It uses human preferences as a reward signal to fine-tune the model, thereby aligning the behavior of the language model with the stated preferences of a specific group of people, rather than some broader notion of ‘human values’.\n\nIn contrast, MCTS is a tree-based search algorithm used to determine the optimal moves in games. It simulates potential moves and updates the values of each move based on their outcomes, guiding the selection of the best move.\n\nRLHF collects data from human-written demonstrations and human-labelled comparisons between AI models, and trains a reward model to predict the preferences of a given group of people. The reward model is then used to fine-tune the AI models. MCTS, on the other hand, uses simulations and evaluations to determine the best decision.\n\nAlthough they are different approaches, RLHF and MCTS also have similarities. Both artificial intelligence techniques use decision-making and problem-solving methods, and both use a trial-and-error approach to explore different options and make decisions based on available information. 
Both are also iterative processes that improve over time as more information and experience are gathered.\n\nThe choice between RLHF and MCTS depends on the task at hand. RLHF is ideal when there is no clear metric for evaluating the model performance, while MCTS has proven effective in game-like tasks where knowledge and exploration of the future give the model a significant advantage.\n\n## Code Optimization for AlphaTensor training\n\nImplementing the AlphaTensor training algorithm requires finding the perfect compromise between training speed and memory consumption. As seen in the Model section, simply considering the action tokenization can save a lot of memory, but an overly aggressive action space reduction can lead to both a drop in accuracy and slower performance. The latter happens because all tokens are generated sequentially in an autoregressive way by the model decoder. Therefore, the inference time grows linearly with the number of tokens per action once the softmax on the action space is not the bottleneck anymore.\n\nWhen setting up AlphaTensor training, the main difficulties were found in dealing with the acting process. If the tensors are not stored in the correct format, the MCTS can easily cause uncontrolled memory usage growth. On the other hand, if the number of tensors stored during each simulation is reduced too much, the MCTS can spend an infinite amount of time re-computing the required states.\n\nLet's take an example of the game simulation step, where the game is explored by looking at possible future scenarios. For each state, if we don't save the actions generated by the model and we decide to save only the random seed used to sample the actions from the policy, then each time we explore a tree node we would have to recompute the policy and then sample the actions. 
Clearly, we decided to store the sampled actions to save time and to avoid having to manage model sharing between different processes in the case of MCTS exploration parallelization.\nHowever, just saving the actions was not enough to get a sufficiently efficient acting step. In fact, the time for converting the n_steps actions into the (u, v, w) triplet, reducing the game tensor state and creating the new 3D tensors from the n_samples actions would easily be a bottleneck for the whole training.\nSecondly, we didn't want to store all possible future states for each sampled action, as this would have a huge impact on the memory used by the algorithm. Suppose we set n_samples=32, n=7 and N=5, and let's remember that N is the size of the square matrix product we want to reduce and n is the number of previous actions remembered by the model. In this situation, each state tensor would have the form (8, 25, 25, 25), which multiplied by 32 would result in `32*8*25*25*25*4` bytes for each node in the graph. Now, considering that each simulation in the expansion phase generates a new node (and n_sim=200), we would have a final memory consumption of `200*32*8*25*25*25*4` = 3.2GB for the first MCTS node alone. In the worst case scenario, while exploring acting max_rank nodes (where `max_rank=150`), this would result in a total memory consumption of 150 * 3.2GB = 480GB in RAM (or GPU memory if all tensors were stored on the GPU). We ran the training on our workstation with 128 GB of RAM and 48 GB of GPU memory, so we had to reduce the memory consumption.\n\nSince we didn't want to increase the execution time, we adopted an optimization that exploits the redundancy in the state tensors produced. In fact, the tensors have n-1 previous actions in common, which can then be stored once and not repeated for each stored tensor. This results in a memory reduction of 2/7~28%, meaning that in the worst case 137GB can be stored. 
At this point, by simply pruning the unused part of the tree (such as the unselected trajectories) and storing the tensors in CPU memory, we were able to avoid any memory error during training.\n\n# Next Steps\n\nWith AlphaTensor now being open source, several exciting avenues for further development open up.\n\nA natural next step is to fine-tune AlphaTensor on specific hardware devices and benchmark performance. At the time of writing, fine-tuning was in progress.\n\nAnother important advance would be the support for remote compilation, allowing users to build algorithms optimized for edge devices. This can be achieved by storing the AlphaTensor model on a server, while the matrix multiplication algorithm is evaluated on different hardware.\n\nIt could also be important to extend support for different compilers to compute the latency-based reward correction. Different compilers can lead to different optimized algorithms on a given hardware. For example, the DeepMind paper showed promising results using JAX and the XLA compiler on TPU and Nvidia GPUs. It would be interesting to evaluate this using NCCL on Nvidia or llvm on CPUs.\n\nFinally, extending the model and training algorithm to support larger matrix sizes remains a major open challenge. Currently, AlphaTensor supports a maximum matrix size of 5, but it can be applied by splitting larger matrix multiplications into groups of tiny MMs with a size smaller than 5. This approach is suboptimal, and performing the reduction directly on the large tensor corresponding to the full MM could theoretically lead to better results.\n\n## Speedster integration of AlphaTensor\n\nAlphaTensor opens the doors for further improvements to Speedster. [Speedster](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster) is an open source module designed to speed up AI inference with just a few lines of code. 
The library automatically applies the best set of SOTA optimization techniques to achieve maximum inference speed-up.\n\nWithin Speedster, AlphaTensor will use its optimized kernels for matrix multiplication to find the optimal set of sub-operations for each layer in the AI model that involves matrix multiplication, including linear layers, attention layers, and convolution layers. The matrix multiplications will be decomposed into sub-matrix multiplications up to the maximum size supported by AlphaTensor, and the fastest decomposition will be selected for each layer. This optimization process will be applied to all layers in the neural network, resulting in a dramatically improved model.\n\nWe expect to see significant speed-ups, especially in transformer models, where large matrix multiplications become the computational bottleneck at larger sizes. We also plan to support AlphaTensor algorithm generation for reduced precision formats, such as fp16 and int8, in addition to fp32.\n"
  },
  {
    "path": "optimization/open_alpha_tensor/setup.py",
    "content": "from pathlib import Path\nfrom setuptools import setup, find_packages\n\n\nREQUIREMENTS = [\n    \"nebullvm\",\n    \"torch\",\n    \"tqdm\",\n]\n\nthis_directory = Path(__file__).parent\nlong_description = (this_directory / \"README.md\").read_text(encoding=\"utf8\")\n\nsetup(\n    name=\"OpenAlphaTensor\",\n    version=\"0.0.1\",\n    packages=find_packages(),\n    install_requires=REQUIREMENTS,\n    long_description=long_description,\n    include_package_data=True,\n    long_description_content_type=\"text/markdown\",\n)\n"
  },
  {
    "path": "optimization/optimate/README.md",
    "content": "# 🧉 OptiMate (WIP)\nInteractive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup.\n\nIf you like this library, give us a star to show your support for the project ⭐\n\n## 📖 Description\nThe OptiMate module is targeted at sophisticated and savvy users who need to squeeze every last drop of performance out of a given hardware setup. \n\nThe module is designed to help users optimize their deep-learning models through the use of profilers and advanced optimization techniques. It also includes a smart assistant that guides the user through the optimization process and provides suggestions to improve the performance of the model. \n\nEach temporary optimization is tracked in a detailed version history, allowing the user to revert to their preferred version at the end of the optimization process.\n\nFirst, the module leverages profilers to gather information about the model, such as the amount of time it takes for the model to make predictions and the amount of memory used. This information helps in identifying bottlenecks and other inefficiencies in the model.\n\nThen, the module uses various optimization techniques to improve inference performance. These techniques include, among others, model compression, pruning, and quantization, which can help reduce the size and computational demand of the model.\n\nThroughout the process, the smart assistant provides guidance and suggestions to the user. For example, it might suggest which optimization techniques to try out or provide guidance on how to adjust the model parameters to improve its performance.\n\nOverall, the module provides a user-friendly but sophisticated interface to get the most out of any model / hardware setup. Try it out today, and reach out if you have any feedback!\n"
  },
  {
    "path": "optimization/speedster/README.md",
    "content": "# 💥 Speedster\n\n`Speedster` reduces inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs). The idea is to make AI inference way cheaper in just a few lines of code.\n\n`Speedster` makes it easy to combine optimization techniques across the whole software-to-hardware stack, delivering best-in-class cost savings. If you like the idea, give us a star to support the project ⭐\n\n![speedster](https://user-images.githubusercontent.com/53374883/225599469-f1a626f0-c001-42bd-bc8b-ec0e966ddad6.png)\n\nThe core `Speedster` workflow consists of 3 steps:\n\n- [x]  **Select**: input your model in your preferred DL framework and express your preferences regarding:\n    - Accuracy loss: do you want to trade off a little accuracy for significant cost savings?\n    - Optimization time: achieving great savings can be time-consuming. Can you wait, or do you need an instant answer?\n- [x]  **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) 
that is compatible with your needs and local hardware.\n- [x]  **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just cheaper 🚀).\n\n# Installation\n\nInstall `Speedster` and its base requirements:\n```\npip install speedster\n```\n\nThen make sure to install all the available deep learning compilers.\n```\npython -m nebullvm.installers.auto_installer --compilers all\n```\n> :warning: For **MacOS** with **ARM processors**, please use a conda environment.\n> Moreover, if you want to optimize a **PyTorch model**, PyTorch must be pre-installed \n> on your environment before proceeding to the next step, please install it from this \n> [link](https://pytorch.org/get-started/locally/).\n\nFor more details on how to install Speedster, please visit our [Installation](https://docs.nebuly.com/Speedster/installation/) guide.\n\n# Quick start\n\nOnly one line of code - that’s what you need to accelerate your model! 
Find below your getting started guide for 5 different input model frameworks:\n\n<details>\n<summary>🔥 PyTorch </summary>\n    \nIn this section, we will learn about the 4 main steps needed to optimize PyTorch models:\n\n1) Input your model and data\n2) Run the optimization\n3) Save your optimized model \n4) Load and run your optimized model in production\n\n```python\nimport torch\nimport torchvision.models as models\nfrom speedster import optimize_model, save_model\n\n#1 Provide input model and data (we support PyTorch Dataloaders and custom input, see the docs to learn more)\nmodel = models.resnet50()  \ninput_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n\n#2 Run Speedster optimization\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    optimization_time=\"constrained\",\n    metric_drop_ths=0.05\n)\n\n#3 Save the optimized model\nsave_model(optimized_model, \"model_save_path\")\n```\n\nOnce the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.\n\n```python\n#4 Load and run your PyTorch accelerated model in production\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\")\n\noutput = optimized_model(input_sample)\n```\nFor more details, please visit [Getting Started with PyTorch Optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/).\n    \n</details>\n<details>\n<summary>🤗 Hugging Face Transformers </summary>\n    \nIn this section, we will learn about the 4 main steps needed to optimize 🤗 Hugging Face Transformer models:\n\n1) Input your model and data\n2) Run the optimization\n3) Save your optimized model \n4) Load and run your optimized model in production\n\n* <details><summary><b>✅ For Decoder-only or Encoder-only architectures (Bert, GPT, etc)</b></summary>\n\n    ```python\n    from transformers import AlbertModel, AlbertTokenizer\n    from speedster import 
optimize_model, save_model\n\n    #1a. Provide input model: Load Albert as an example\n    model = AlbertModel.from_pretrained(\"albert-base-v1\")\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n\n    #1b. Dictionary input format (also string format is accepted, see the docs to learn more)\n    text = \"This is an example text for the huggingface model.\"\n    input_dict = tokenizer(text, return_tensors=\"pt\")\n    input_data = [input_dict for _ in range(100)]\n\n    #2 Run Speedster optimization (if input data is in string format, also the tokenizer \n    # should be given as input argument, see the docs to learn more)\n    optimized_model = optimize_model(\n        model, \n        input_data=input_data, \n        optimization_time=\"constrained\",\n        metric_drop_ths=0.05\n    )\n\n    #3 Save the optimized model\n    save_model(optimized_model, \"model_save_path\")\n    ```\n\n    Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.\n\n    ```python\n    #4 Load and run your Huggingface accelerated model in production\n    from speedster import load_model\n\n    optimized_model = load_model(\"model_save_path\")\n\n    output = optimized_model(**input_sample)\n    ```\n    For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).\n\n    </details>\n\n* <details><summary><b>✅ For Encoder-Decoder architectures (T5 etc)</b></summary>\n\n\n    ```python\n    from transformers import T5Tokenizer, T5ForConditionalGeneration\n    from speedster import optimize_model, save_model\n\n    #1a. Provide input model: Load T5 as an example\n    model = T5ForConditionalGeneration.from_pretrained(\"t5-small\")\n    tokenizer = T5Tokenizer.from_pretrained(\"t5-small\") \n\n    #1b. 
Dictionary input format\n    question = \"What's the meaning of life?\"\n    answer = \"The answer is:\"\n    input_dict = tokenizer(question, return_tensors=\"pt\")\n    input_dict[\"decoder_input_ids\"] = tokenizer(answer, return_tensors=\"pt\").input_ids\n    input_data = [input_dict for _ in range(100)]\n\n    #2 Run Speedster optimization (if input data is in string format, also the tokenizer \n    # should be given as input argument, see the docs to learn more)\n    optimized_model = optimize_model(\n        model, \n        input_data=input_data, \n        optimization_time=\"constrained\",\n        metric_drop_ths=0.05\n    )\n\n    #3 Save the optimized model\n    save_model(optimized_model, \"model_save_path\")\n    ```\n\n    Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.\n\n    ```python\n    #4 Load and run your Huggingface accelerated model in production\n    from speedster import load_model\n\n    optimized_model = load_model(\"model_save_path\")\n\n    output = optimized_model(**input_sample)\n    ```\n    For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).\n\n    </details>\n    \n</details>\n\n<details>\n<summary>🧨 Hugging Face Diffusers </summary>\n\n> :warning: In order to work properly, the diffusers optimization requires `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. 
For additional details, please look the docs [here](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).\n\nIn this section, we will learn about the 4 main steps needed to optimize Stable Diffusion models from the Diffusers library:\n\n1) Input your model and data\n2) Run the optimization\n3) Save your optimized model \n4) Load and run your optimized model in production\n\n```python\nimport torch\nfrom diffusers import StableDiffusionPipeline\nfrom speedster import optimize_model, save_model\n\n#1 Provide input model and data\nmodel_id = \"CompVis/stable-diffusion-v1-4\"\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\nif device == \"cuda\":\n    # On GPU we load by default the model in half precision, because it's faster and lighter.\n    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\nelse:\n    pipe = StableDiffusionPipeline.from_pretrained(model_id)\n\n# Create some example input data\ninput_data = [\n    \"a photo of an astronaut riding a horse on mars\",\n    \"a monkey eating a banana in a forest\",\n    \"white car on a road surrounded by palm trees\",\n    \"a fridge full of bottles of beer\",\n    \"madara uchiha throwing asteroids against people\"\n]\n\n#2 Run Speedster optimization\noptimized_model = optimize_model(\n    model=pipe,\n    input_data=input_data,\n    optimization_time=\"unconstrained\",\n    ignore_compilers=[\"torch_tensor_rt\", \"tvm\"],\n    metric_drop_ths=0.1,\n)\n\n#3 Save the optimized model\nsave_model(optimized_model, \"model_save_path\")\n```\n\nOnce the optimization is completed, start using the accelerated model (on steroids 🚀).\n\n```python\n#4 Load and run your PyTorch accelerated model in production\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\", pipe=pipe)\n\ntest_prompt = \"futuristic llama with a cyberpunk city on the background\"\noutput = optimized_model(test_prompt).images[0]\n```\nFor 
more details, please visit [Getting Started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).\n    \n</details>\n\n<details>\n<summary>🌊 TensorFlow/Keras </summary>\n    \nIn this section, we will learn about the 4 main steps needed to optimize TensorFlow/Keras models:\n\n1) Input your model and data\n2) Run the optimization\n3) Save your optimized model \n4) Load and run your optimized model in production\n\n```python\nimport tensorflow as tf\nfrom tensorflow.keras.applications.resnet50 import ResNet50\nfrom speedster import optimize_model, save_model\n\n#1 Provide input model and data (we support Keras dataset and custom input, see the docs to learn more)\nmodel = ResNet50() \ninput_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)]\n\n#2 Run Speedster optimization\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    optimization_time=\"constrained\",\n    metric_drop_ths=0.05\n)\n\n#3 Save the optimized model\nsave_model(optimized_model, \"model_save_path\")\n```\n\nOnce the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.\n\n```python\n#4 Load and run your TensorFlow accelerated model in production\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\")\n\noutput = optimized_model(input_sample)\n```\nFor more details, please visit [Getting Started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/).\n\n</details>\n<details>\n    \n<summary> ⚡ ONNX </summary>\n\nIn this section, we will learn about the 4 main steps needed to optimize ONNX models:\n\n1) Input your model and data\n2) Run the optimization\n3) Save your optimized model \n4) Load and run your optimized model in production\n\n```python\nimport numpy as np\nfrom speedster import optimize_model, save_model\n\n#1 Provide input model and 
data\n# Model was downloaded from here: \n# https://github.com/onnx/models/tree/main/vision/classification/resnet\nmodel = \"resnet50-v1-12.onnx\" \ninput_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)]\n\n#2 Run Speedster optimization\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    optimization_time=\"constrained\",\n    metric_drop_ths=0.05\n)\n\n#3 Save the optimized model\nsave_model(optimized_model, \"model_save_path\")\n```\n\nOnce the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.\n\n```python\n#4 Load and run your ONNX accelerated model in production\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\")\n\noutput = optimized_model(input_sample)\n```\nFor more details, please visit [Getting Started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/).\n    \n</details>\n\n# **Documentation**\n\n- [Installation](https://docs.nebuly.com/Speedster/installation/)\n- [Getting started with PyTorch optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/)\n- [Getting started with Hugging Face optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/)\n- [Getting started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/)\n- [Getting started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/)\n- [Getting started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/)\n- [Key concepts](https://docs.nebuly.com/Speedster/key_concepts/)\n- [Notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster)\n- [Advanced options](https://docs.nebuly.com/Speedster/advanced_options/)\n- 
[Benchmarks](https://docs.nebuly.com/Speedster/benchmarks/)\n\n\n# **Key concepts**\n\nSpeedster's design reflects our mission to automatically master each and every existing AI acceleration technique to deliver the most cost-efficient AI ever. As a result, `Speedster` leverages available enterprise-grade open-source optimization tools. If these tools and  communities already exist, and are distributed under a permissive license (Apache, MIT, etc), we integrate them and happily contribute to their communities. However, many tools do not exist yet, in which case we implement them and open-source the code so that our community can benefit from it.\n\n`Speedster` is shaped around **4 building blocks** and leverages a modular design to foster scalability and integration of new acceleration components across the software to hardware stack.\n\n- [x]  **Converter:** converts the input model from its original framework to the framework backends supported by `Speedster`, namely PyTorch, ONNX and TensorFlow. This allows the Compressor and Compiler modules to apply any optimization technique to the model.\n- [x]  **Compressor:** applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training.\n- [x]  **Compiler:** converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. 
The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files.\n- [x]  **Inference Learner:** takes the best performing compiled model and converts it back into the same interface as the original input model.\n\n![speedster_blocks](https://user-images.githubusercontent.com/42771598/213177175-a76908a2-5eef-4e82-9d54-0fc812131463.png)\n\nThe **compressor** stage leverages the following open-source projects:\n\n- [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance.\n- [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models.\n\nThe **compiler stage** leverages the following open-source projects:\n\n- [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators.\n- [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads.\n- [DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs.\n- [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference.\n- [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator\n- [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators.\n- [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): 
open-source libraries to accelerate TensorFlow models.\n\n\n\n# **Community**\nWe’re developing `Speedster` for and together with our community, so please get in touch on GitHub or Discord. \n\n• **[GitHub issues](https://github.com/nebuly-ai/nebullvm/issues)**: suggest new acceleration components, request new features, and report bugs and improvements.\n\n• **[Discord](https://discord.gg/RbeQMu886J)**: learn about AI acceleration, share exciting projects and hang out with our global community.\n\nThe best way to get started is to pick a good-first issue. Please read our [contribution guidelines](https://docs.nebuly.com/contributions/) for a deep dive into how to best contribute to our project!\n\nDon't forget to leave a star ⭐ to support the project and happy acceleration 🚀\n"
  },
  {
    "path": "optimization/speedster/docs/en/docs/advanced_options.md",
    "content": "# Advanced options\n\nIf you’re new to the library, you may want to start with the **Getting started** section.\n\nThe user guide here shows more advanced workflows and how to use the library in different ways. We are going to show some examples of more advanced usages of `Speedster`, that we hope will give you a deeper insight of how `Speedster` works. \n\nIn particular, we will overview:\n\n- [`optimize_model`](#optimizemodel-api) API\n- [Acceleration suggestions](#acceleration-suggestions)\n- [Selecting which device](#selecting-which-device-to-use--cpu-gpu-and-other-accelerators) to use: CPU, GPU and other accelerators\n- [Optimization Time: constrained vs unconstrained](#optimization-time--constrained-vs-unconstrained)\n- [Selecting specific compilers/compressors](#select-specific-compilerscompressors)\n- [Using dynamic shape](#using-dynamic-shape)\n- [Enable TensorrtExecutionProvider for ONNXRuntime on GPU](#enable-tensorrtexecutionprovider-for-onnxruntime-on-gpu)\n- [Custom models](#custom-models)\n- [Store the performances of all the optimization techniques](#store-the-performances-of-all-the-optimization-techniques)\n- [Set number of threads](#set-number-of-threads)\n\n## `optimize_model` API\n\nThe `optimize_model` function allows to optimize a model from one of the supported frameworks (PyTorch, HuggingFace, TensorFlow, ONNX), and returns an optimized model that can be used with the same interface as the original model.\n\n```python\ndef optimize_model(\n        model: Any,\n        input_data: Union[Iterable, Sequence],\n        metric_drop_ths: Optional[float] = None,\n        metric: Union[str, (...) 
-> Any, None] = None,\n        optimization_time: str = \"constrained\",\n        dynamic_info: Optional[dict] = None,\n        config_file: Optional[str] = None,\n        ignore_compilers: Optional[List[str]] = None,\n        ignore_compressors: Optional[List[str]] = None,\n        store_latencies: bool = False,\n        device: str = None,\n        **kwargs: Any\n) -> Any\n```\n\n**Arguments**\n\n`model`: Any\n\nThe input model can belong to one of the following frameworks: PyTorch, TensorFlow, ONNX, HuggingFace. In the ONNX case, `model` is a string with the path to the saved onnx model. In the other cases, it is a torch.nn.Module or a tf.Module.\n\n`input_data`: Iterable or Sequence\n\nInput data needed to test the optimization performances (latency, throughput, accuracy loss, etc). It can consist of one or more data samples. Note that if `optimization_time` is set to \"unconstrained,\" it would be preferable to provide at least 100 data samples to also activate `Speedster` techniques that require more data (pruning, etc.). See the Getting started section to learn more about the `input_data` depending on your input framework:\n\n- [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md#1-input-model-and-data)\n- [Getting started with 🤗 HuggingFace optimization](getting_started/hf_getting_started.md#1-input-model-and-data)\n- [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md#1-input-model-and-data)\n- [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md#1-input-model-and-data)\n- [Getting started with ONNX optimization](getting_started/onnx_getting_started.md#1-input-model-and-data)\n\n`metric_drop_ths`: float, optional\n\nMaximum drop in your preferred metric (see \"metric\" section below). All the optimized models having a larger error with respect to the `metric_drop_ths` will be discarded. 
\n\nDefault: 0.\n\n`metric`: Callable, optional\n\nMetric to be used for estimating the error that may arise from using optimization techniques and for evaluating if the error exceeds the `metric_drop_ths`.  `metric` accepts as input a string, a user-defined metric, or None. Metric accepts a string containing the name of the metric; it currently supports:\n\n- \"numeric_precision\"\n- \"accuracy\". \n- user-defined metric: function that takes as input the output of the original model and the one of the optimized model, and, if available, the original label. The function calculates and returns the reduction in the metric due to the optimization. \n\nDefault: \"numeric_precision\". \n\n`optimization_time`: OptimizationTime, optional\n\nThe optimization time mode. It can be \"constrained\" or \"unconstrained\". In \"constrained\" mode, Speedster takes advantage only of compilers and precision reduction techniques, such as quantization. \"unconstrained\" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation. Note that most techniques activated in \"unconstrained\" mode require fine-tuning, and therefore it is recommended to provide at least 100 samples as input_data. \n\nDefault: \"constrained\".\n\n`dynamic_info`: Dict, optional\n\nDictionary containing dynamic axis information. It should contain as keys both \"input\" and \"output\" and as values two lists of dictionaries, where each dictionary represents dynamic axis information for an input/output tensor. The inner dictionary should have an integer as a key, i.e. the dynamic axis (also considering the batch size) and a string as a value giving it a tag, e.g., \"batch_size.\". \n\nDefault: None.\n\n`config_file`: str, optional\n\nConfiguration file containing the parameters needed to define the CompressionStep in the pipeline. \n\nDefault: None.\n\n`ignore_compilers`: List[str], optional\n\nList of DL compilers ignored during optimization execution. 
The compiler name should be one among tvm, tensor RT, openvino, onnxruntime, deepsparse, tflite, bladedisc, torchscript, intel_neural_compressor . \n\nDefault: None.\n\n`ignore_compressors`: List[str], optional\n\nList of DL compressors ignored during the compression stage. The compressor name should be one among sparseml and intel_pruning. \n\nDefault: None.\n\n`store_latencies`: bool, optional\n\nParameter that allows to store the latency for each compiler used by Speedster in a json file. The JSON is created in the working directory. \n\nDefault: False.\n\n`device`: str, optional\n\nDevice used for inference, it can be cpu or gpu/cuda (both gpu and cuda options are supported). A specific gpu can be selected using notation gpu:1 or cuda:1. gpu will be used if available, otherwise cpu. \n\nDefault: None.\n\n**Returns: Inference Learner**\n\nOptimized version with the same interface of the input model. For example, optimizing a PyTorch model will return an InferenceLearner object that can be called exactly like a PyTorch model (either with model.forward(input) or model(input)). The optimized model will therefore take as input a torch.Tensors and return a torch.Tensors.\n\n## Acceleration suggestions\n\nIf the speedup you obtained with the first optimization with `Speedster` is not enough, we suggest the following actions:\n\n- Include more backends for optimization, i.e. 
set `--backend all`\n- Increase the `metric_drop_ths` by 5%, if possible: see [Optimize_model API](#optimize_model-api)\n- Verify that your device is supported by your version of speedster: see [Supported hardware](hardware.md)\n- Try to accelerate your model on a different hardware or consider using the CloudSurfer module to automatically understand which is the best hardware for your model: see [CloudSurfer](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/cloud_surfer) module.\n\n## Selecting which device to use: CPU, GPU and other accelerators.\n\nSpeedster currently supports the following devices: `CPUs`, `GPUs`, `TPUs` and `AWS Inferentia chips`.\n\nThe parameter `device` allows to select which device we want to use for inference. By default, `Speedster` will use the accelerator if available on the machine, otherwise it will use cpu. If we are running on a machine with an available accelerator and we want to optimize the model for cpu inference, we can use:\n\n```python\nfrom speedster import optimize_model\n\noptimized_model = optimize_model(\n  model, input_data=input_data, device=\"cpu\"\n)\n```\n\nIf we are working on a multi-gpu machine and we want to use a specific gpu, we can use:\n\n```python\nfrom speedster import optimize_model\n\noptimized_model = optimize_model(\n  model, input_data=input_data, device=\"cuda:1\"  # also device=\"gpu:1\" is supported\n)\n```\n\nThe same applies also for TPUs and AWS Inferentia chips: \n\n```python\nfrom speedster import optimize_model\n\noptimized_model = optimize_model(\n  model, input_data=input_data, device=\"tpu:1\"  # use tpu #1\n)\n\noptimized_model = optimize_model(\n  model, input_data=input_data, device=\"neuron:1\"  # use Inferentia chip #1\n)\n```\n\n## Optimization Time: constrained vs unconstrained\n\nOne of the first options that can be customized in `Speedster` is the `optimization_time` parameter. 
In order to optimize the model, `Speedster` will try a list of compilers which allow to keep the same accuracy of the original model. In addition to compilers, it can also use other techniques such as pruning, quantization, and other compression techniques which can lead to a little drop in accuracy and may require some time to complete. \n\nWe defined two scenarios:\n\n- **constrained**: only compilers and precision reduction techniques are used, so the compression step (the most time consuming one) is skipped. Moreover, in some cases the same compiler could be available for more than one pipeline, for example tensor RT is available both with PyTorch and ONNX backends. In the constrained scenario, each compiler will be used only once, so if for example we optimize a PyTorch model and tensor RT in the PyTorch pipeline manages to optimize the model, it won't be used again in the ONNX pipeline.\n\n- **unconstrained**: in this scenario, `Speedster` will use all the compilers available, even if they appear in more than one backend. It also allows the usage of more time consuming techniques such as pruning and distillation. Note that for using many of the sophisticated techniques in the 'unconstrained' optimization, a small fine-tuning of the model will be needed. Thus, we highly recommend to provide as input_data at least 100 samples when selecting 'unconstrained' optimization.\n\n\n##  Select specific compilers/compressors\n\nThe `optimize_model` functions accepts also the parameters `ignore_compilers` and `ignore_compressors`, which allow to skip specific compilers or compressors. 
\nThe full list of available options is the following:\n- _ignore_compilers_: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `onnx_tensor_rt`, `torchscript`, `onnxruntime`, `tflite`, `tvm`, `onnx_tvm`, `torch_tvm`, `bladedisc`, `openvino`, `intel_neural_compressor`, `torch_xla`, `torch_neuron`.\n- _ignore_compressors_: `sparseml`, `intel_pruning`.\n\nSome compilers, such as tensor RT, are available for both PyTorch and ONNX backends. For this reason in the list of compilers we have `tensor_rt` which skips both the PyTorch and ONNX pipelines, and `torch_tensor_rt` and `onnx_tensor_rt` which skip only the PyTorch and ONNX pipelines respectively.\n\nIf we want to skip the `tvm` and `bladedisc` optimizers, we could write:\n\n```python\nfrom speedster import optimize_model\n\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    ignore_compilers=[\"tvm\", \"bladedisc\"]\n)\n```\n\n## Using dynamic shape\n\nBy default, a model optimized with `Speedster` will have a static shape. This means that it can be used in inference only with the same shape of the inputs provided to the `optimize_model` function during the optimization. 
The dynamic shape however is fully supported, and can be enabled with the `dynamic_info` parameter (see the [optimize_model API](#optimize_model-api) arguments to see how this parameter is defined.)\n\nFor each dynamic axis in the inputs, we need to provide the following information:\n- the axis number (starting from 0, considering the batch size as the first axis)\n- a tag that will be used to identify the axis\n- the minimum, optimal and maximum sizes of the axis (some compilers will work also for shapes that are not in the range [min, max], but the performance may be worse)\n\nLet's see an example of a model that takes two inputs, where the batch size must be dynamic, as well as the size on the third and fourth dimensions.\n\n```python\nimport torch\nimport torchvision.models as models\nfrom speedster import optimize_model\n\n# Load a resnet as example\nmodel = models.resnet50()\n\n# Provide an input data for the model\ninput_data = [((torch.randn(1, 3, 256, 256),), torch.tensor([0])) for _ in range(100)]\n\n# Set dynamic info\ndynamic_info = {\n    \"inputs\": [\n        {\n            0: {\n                \"name\": \"batch\",\n                \"min_val\": 1,\n                \"opt_val\": 1,\n                \"max_val\": 8,\n            }, \n            2: {\n                \"name\": \"dim_image\",\n                \"min_val\": 128,\n                \"opt_val\": 256,\n                \"max_val\": 512,\n            }, \n            3: {\n                \"name\": \"dim_image\",\n                \"min_val\": 128,\n                \"opt_val\": 256,\n                \"max_val\": 512,\n            }, \n        }\n    ],\n    \"outputs\": [\n        {0: \"batch\", 1: \"out_dim\"}\n    ]\n}\n\n# Run Speedster optimization in one line of code\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    optimization_time=\"constrained\", \n    dynamic_info=dynamic_info\n)\n```\n\n## Enable TensorrtExecutionProvider for ONNXRuntime on GPU\n\nBy 
default, `Speedster` will use the `CUDAExecutionProvider` for ONNXRuntime on GPU. If you want to use the `TensorrtExecutionProvider` instead, you must add the TensorRT installation path to the env variable LD_LIBRARY_PATH.\nIf you installed TensorRT through the nebullvm auto_installer, you can do it by running the following command in the terminal:\n\n```bash\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\"/<PATH_TO_PYTHON_FOLDER>/site-packages/tensorrt\"\n```\n\n## Custom models\n\n`Speedster` is designed to optimize models that take as inputs and return in output only tensors or np.ndarrays (and dictionaries/strings for huggingface). Some models may require instead a custom input, for example a dictionary where the keys are the names of the inputs and the values are the input tensors, or may return a dictionary as output. We can optimize such models with `Speedster` by defining a model wrapper.\n\nLet's take the example of the detectron2 model which takes as input a tuple of tensors but returns a dictionary as output:\n\n```python\n class BaseModelWrapper(torch.nn.Module):\n    def __init__(self, core_model, output_dict):\n        super().__init__()\n        self.core_model = core_model\n        self.output_names = [key for key in output_dict.keys()]\n    \n    def forward(self, *args, **kwargs):\n        res = self.core_model(*args, **kwargs)\n        return tuple(res[key] for key in self.output_names)\n\n\nclass OptimizedWrapper(torch.nn.Module):\n    def __init__(self, optimized_model, output_keys):\n        super().__init__()\n        self.optimized_model = optimized_model\n        self.output_keys = output_keys\n    \n    def forward(self, *args):\n        res = self.optimized_model(*args)\n        return {key: value for key, value in zip(self.output_keys, res)}\n\ninput_data = [((torch.randn(1, 3, 256, 256)), torch.tensor([0]))]\n\n# Compute the original output of the model (in dict format) \nres = model_backbone(torch.randn(1, 3, 256, 256))\n\n# Pass the model 
and the output sample to the wrapper\nbackbone_wrapper = BaseModelWrapper(model_backbone, res)\n\n# Optimize the model wrapper\noptimized_model = optimize_model(backbone_wrapper, input_data=input_data)\n\n# Wrap the optimized model with a new wrapper to restore the original model output format\noptimized_backbone = OptimizedWrapper(optimized_model, backbone_wrapper.output_names)\n\n```\n\nYou can find other examples in the [notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) section available on GitHub.\n\n## Store the performances of all the optimization techniques\n\n`Speedster` internally tries all the techniques available on the target hardware and automatically chooses the fastest one. If you need more details on the inference times of each compiler, you can set the `store_latencies` parameter to `True`. A json file will be created in the working directory, listing all the results of the applied techniques and of the original model itself.\n\n```python\n# Run Speedster optimization in one line of code\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    store_latencies=True\n)\n```\n\n## Set number of threads\nWhen running multiple replicas of the model in parallel, it would be useful for CPU-optimized algorithms to limit the number of threads to use for each model. In `Speedster`, it is possible to set the maximum number of threads a single model can use with the environment variable `NEBULLVM_THREADS_PER_MODEL`. \n\nFor instance, you can run:\n\n```bash\nexport NEBULLVM_THREADS_PER_MODEL=2\n```\n\nto use just two CPU threads per model at inference time and during optimization."
  },
  {
    "path": "optimization/speedster/docs/en/docs/benchmarks.md",
    "content": "# Benchmarks\n\n!!! info\n    In this section you are going to learn how `Speedster` accelerates the inference of various models on different hardware architecture.\n\nHere we provide a preview of the following accelerated models:\n\n- [Bert](#bert)\n- [YoloV5](#yolov5)\n- [EfficientNet](#efficientnet)\n- [GPT2](#gpt2)\n- [ResNet](#resnet)\n- [Roberta](#roberta)\n\nThe above models are tested on very popular hardware architecture and instances:\n\n- AWS - c5n,2xlarge\n- AWS - c5,12xlarge\n- AWS - c6i.12xlarge\n- AWS - m6i,24xlarge\n- NVIDIA T4\n- NVIDIA V100\n- NVIDIA 3090\n\n## Bert\n![bert](images/bert.png)\n\n## YoloV5\n![yolo](images/yolov5.png)\n\n## EfficientNet\n![yolo](images/efficientnet.png)\n\n## GPT2\n![yolo](images/gpt2.png)\n\n## ResNet\n![yolo](images/resnet.png)\n\n## Roberta\n![yolo](images/roberta.png)"
  },
  {
    "path": "optimization/speedster/docs/en/docs/getting_started/diffusers_getting_started.md",
    "content": "# Getting started with Stable Diffusion optimization\nIn this section, we will learn about the 4 main steps needed to optimize Stable Diffusion models from the `Diffusers` library:\n\n1. [Environment Setup](#1-input-model-and-data)\n2. [Input your model and data](#2-input-model-and-data)\n3. [Run the optimization](#3-run-the-optimization)\n4. [Save your optimized model](#4-save-your-optimized-model)\n5. [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production)\n\n## 1) Environment Setup (GPU only)\nIn order to optimize a Stable Diffusion model, you have to ensure that your environment is correctly set up according to these requirements: `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`.\n\nFrom TensorRT 8.6, all the tensorrt pre-built wheels released by nvidia support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. In that case, you will have to upgrade your CUDA version and then to upgarde tensorrt to 8.6.0 or above.\n\nThere should be a way to run TensorRT 8.6 also with CUDA 11, but it requires installing TensorRT in a different way, you can check this issue: https://github.com/NVIDIA/TensorRT/issues/2773. 
Otherwise, we highly suggest to just upgrade to CUDA 12.\n\nFor now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully.\n\nYou can check your CUDA version with the following command:\n\n```bash\nnvidia-smi\n```\n\nIf you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads\n\nYou can check your TensorRT version with the following command:\n\n```bash\npython -c \"import tensorrt; print(tensorrt.__version__)\"\n```\n\nIf you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running:\n```\npip install -U tensorrt\n```\n\nYou can finally check your PyTorch version  with the command\n```bash\npython -c \"import torch; print(torch.__version__)\"\n```\nIf you have torch>=2.0.0, you can downgrade it by running:\n```\npip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117\n```\n\n## 2) Input model and data\n\n!!! info\n    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). 
\n\n\nFor Stable Diffusion models Speedster expects the input data to be a list of sentences: ```List[str]```\n\n```python\nimport torch\nfrom speedster import optimize_model\nfrom diffusers import StableDiffusionPipeline\n\n\n# Load Stable Diffusion 1.4 as example\nmodel_id = \"CompVis/stable-diffusion-v1-4\"\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\nif device == \"cuda\":\n    # On GPU we load by default the model in half precision, because it's faster and lighter.\n    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\nelse:\n    pipe = StableDiffusionPipeline.from_pretrained(model_id)\n\n# Create some example input data\ninput_data = [\n    \"a photo of an astronaut riding a horse on mars\",\n    \"a monkey eating a banana in a forest\",\n    \"white car on a road surrounded by palm trees\",\n    \"a fridge full of bottles of beer\",\n    \"madara uchiha throwing asteroids against people\"\n]\n```\n\nNow that your input model and data are ready, you can move on to the [Run the optimization](#3-run-the-optimization) section 🚀.\n\n## 3) Run the optimization\nOnce the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. \n\nThe function takes the following arguments as inputs:\n\n- `model`: model to be optimized in your preferred framework (A Diffusers pipe in this case)\n- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)\n- `optimization_time`: in \"constrained\" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. 
\"unconstrained\" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation \n- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration\n\nand returns the accelerated version of your model 🚀.\n\n``` python\nfrom speedster import optimize_model\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n    pipe, \n    input_data=input_data, \n    optimization_time=\"unconstrained\",\n    metric_drop_ths=0.05\n)\n```\n\nInternally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.\n\nAt the end of the optimization, you are going to see the results in a summary table like the following:\n\n![pt](../images/stable_diffusion.png)\n\nIf the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.\n\nIf you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.\n\n## 4) Save your optimized model\nAfter accelerating the model, it can be saved using the `save_model` function:\n\n```python\nfrom speedster import save_model\n\nsave_model(optimized_model, \"model_save_path\")\n```\n\nNow you are all set to use your optimized model in production. 
To explore how to do it, see the [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production) section.\n\n## 5) Load and run your optimized model in production\nOnce the optimized model has been saved,  it can be loaded with the `load_model` function:\n```python\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\", pipe=pipe)\n```\n\nIn this case we must also provide the original pipe as an argument to the `load_model` function; Speedster will automatically load the optimized model and replace the original UNet inside the pipe.\n\nThe optimized model can be used for accelerated inference in the same way as the original model:\n\n```python\n# Use the accelerated version of your Stable Diffusion model in production\noutput = optimized_model(test_prompt).images[0]\n```\n\n!!! info\n    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.\n\nIf you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section."
  },
  {
    "path": "optimization/speedster/docs/en/docs/getting_started/hf_getting_started.md",
    "content": "# Getting started with HuggingFace optimization\nIn this section, we will learn about the 4 main steps needed to optimize your 🤗 HuggingFace models:\n\n1. [Input your model and data](#1-input-model-and-data)\n2. [Run the optimization](#2-run-the-optimization)\n3. [Save your optimized model](#3-save-your-optimized-model)\n4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)\n\n## 1) Input model and data\n\n!!! info\n    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). \n\nFor HuggingFace models we support different types of input data depending on the architecture of your input model.\n\n- [x]  For Decoder-only or Encoder-only architectures (Bert, GPT, etc), we support:\n\n    - Dictionary\n    - String\n\n- [x]  For Encoder-Decoder architectures (T5 etc), we support: \n    - Dictionary\n\n\n=== \"Decoder-only or Encoder-only (Bert, GPT, etc)\"\n    **Input as Dictionary**\n\n    ```python\n    from transformers import AlbertModel, AlbertTokenizer\n\n    # Load Albert as example\n    model = AlbertModel.from_pretrained(\"albert-base-v1\")\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n\n    # Case 1: dictionary input format\n    text = \"This is an example text for the huggingface model.\"\n    input_dict = tokenizer(text, return_tensors=\"pt\")\n    input_data = [input_dict for _ in range(100)]\n    ```\n    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.\n\n\n    **Input as String**\n\n    In the string case, the HuggingFace tokenizer must be given as input to the `optimize_model` in addition to the `input_data`, and the arguments for the tokenizer can be passed using the param `tokenizer_args`.\n\n    ```python\n    from 
transformers import AlbertModel, AlbertTokenizer\n\n    # Load Albert as example\n    model = AlbertModel.from_pretrained(\"albert-base-v1\")\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n\n    # Case 2: strings input format\n    input_data = [\n        \"This is a test.\",\n        \"Hi my name is John.\",\n        \"The cat is on the table.\",\n    ]\n    tokenizer_args = dict(\n        return_tensors=\"pt\",\n        padding=\"longest\",\n        truncation=True,\n    )\n    ```\n    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.\n\n=== \"Encoder-Decoder architectures (T5 etc)\"\n    For encoder-decoder architectures we support only `input_data` as Dictionary:\n    ```python\n    from transformers import T5Tokenizer, T5ForConditionalGeneration\n\n    # Load T5 as example\n    model = T5ForConditionalGeneration.from_pretrained(\"t5-small\")\n    tokenizer = T5Tokenizer.from_pretrained(\"t5-small\") \n\n    # Case 1: dictionary input format\n    question = \"What's the meaning of life?\"\n    answer = \"The answer is:\"\n    input_dict = tokenizer(question, return_tensors=\"pt\")\n    input_dict[\"decoder_input_ids\"] = tokenizer(answer, return_tensors=\"pt\").input_ids\n    input_data = [input_dict for _ in range(100)]\n    ```\n    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.\n\n\n## 2) Run the optimization\nOnce the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 
\n\nThe function takes the following arguments as inputs:\n\n- `model`: model to be optimized in your preferred framework (HuggingFace in this case)\n- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)\n- `optimization_time`: if \"constrained\" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. \"unconstrained\" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation \n- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration\n\nand returns the accelerated version of your model 🚀.\n\nDepending on the format of your `input_data`, the `optimize_model` is as follows:\n\n=== \"Input as Dictionary\"\n    ```python\n    from speedster import optimize_model\n\n    # Run Speedster optimization\n    optimized_model = optimize_model(\n        model, \n        input_data=input_data, \n        optimization_time=\"constrained\",\n        metric_drop_ths=0.05\n    )\n    ```\n\n=== \"Input as String\"\n    ```python\n    from speedster import optimize_model\n\n    # Run Speedster optimization\n    optimized_model = optimize_model(\n        model, \n        input_data=input_data, \n        optimization_time=\"constrained\", \n        metric_drop_ths=0.05,\n        tokenizer=tokenizer,\n        tokenizer_args={\"return_tensors\": \"pt\"}\n    )\n    ```\n\nInternally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. 
From these, it will choose the ones with the lowest latency on the specific hardware.\n\nAt the end of the optimization, you are going to see the results in a summary table like the following:\n\n![pt](../images/pt_table.png)\n\nIf the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.\n\nIf you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.\n\n## 3) Save your optimized model\nAfter accelerating the model, it can be saved using the `save_model` function:\n\n```python\nfrom speedster import save_model\n\nsave_model(optimized_model, \"model_save_path\")\n```\n\nNow you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.\n\n## 4) Load and run your optimized model in production\nOnce the optimized model has been saved,  it can be loaded with the `load_model` function:\n```python\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\")\n```\n\nThe optimized model can be used for accelerated inference in the same way as the original model:\n\n```python\n# Use the accelerated version of your HuggingFace model in production\noutput = optimized_model(**input_sample)\n```\n\n!!! info\n    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. 
After this warm-up time, the next ones will be faster than ever.\n\nIf you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section."
  },
  {
    "path": "optimization/speedster/docs/en/docs/getting_started/onnx_getting_started.md",
    "content": "# Getting started with ONNX optimization\nIn this section, we will learn about the 4 main steps needed to optimize your ONNX models:\n\n1. [Input your model and data](#1-input-model-and-data)\n2. [Run the optimization](#2-run-the-optimization)\n3. [Save your optimized model](#3-save-your-optimized-model)\n4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)\n\n## 1) Input model and data\n\n!!! info\n    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). \n\n```python\nimport numpy as np\n\n# Load a resnet as example\n# Model was downloaded from here: \n# https://github.com/onnx/models/tree/main/vision/classification/resnet\nmodel = \"resnet50-v1-12.onnx\"\n\n# Provide input data for the model    \ninput_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)]\n```\n\nNow your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.\n\n## 2) Run the optimization\nOnce the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. \n\nThe function takes the following arguments as inputs:\n\n- `model`: model to be optimized in your preferred framework (ONNX in this case)\n- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)\n- `optimization_time`: if \"constrained\" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. 
\"unconstrained\" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation \n- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration\n\nand returns the accelerated version of your model 🚀.\n\n``` python\nfrom speedster import optimize_model\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    optimization_time=\"constrained\",\n    metric_drop_ths=0.05\n)\n```\n\nInternally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.\n\nAt the end of the optimization, you are going to see the results in a summary table like the following:\n\n![pt](../images/pt_table.png)\n\nIf the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.\n\nIf you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.\n\n## 3) Save your optimized model\nAfter accelerating the model, it can be saved using the `save_model` function:\n\n```python\nfrom speedster import save_model\n\nsave_model(optimized_model, \"model_save_path\")\n```\n\nNow you are all set to use your optimized model in production. 
To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.\n\n## 4) Load and run your optimized model in production\nOnce the optimized model has been saved,  it can be loaded with the `load_model` function:\n```python\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\")\n```\n\nThe optimized model can be used for accelerated inference in the same way as the original model:\n\n```python\n# Use the accelerated version of your ONNX model in production\noutput = optimized_model(input_sample)\n```\n\n!!! info\n    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.\n\nIf you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section."
  },
  {
    "path": "optimization/speedster/docs/en/docs/getting_started/pytorch_getting_started.md",
    "content": "# Getting started with PyTorch optimization\nIn this section, we will learn about the 4 main steps needed to optimize PyTorch models:\n\n1. [Input your model and data](#1-input-model-and-data)\n2. [Run the optimization](#2-run-the-optimization)\n3. [Save your optimized model](#3-save-your-optimized-model)\n4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)\n\n## 1) Input model and data\n\n!!! info\n    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). \n\n\nFor PyTorch models we support two types of input data:\n\n* Custom data format\n* PyTorch DataLoader\n\n=== \"Custom Data Format\"\n    Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]```\n\n    - Each element of the list is a tuple, which represents a batch of the dataset.\n    - In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. 
The label is optional, so it can be omitted.\n\n    ``` python\n    import torch\n    import torchvision.models as models\n\n    # Load a resnet as example\n    model = models.resnet50()\n\n    # Provide input data for the model    \n    input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n    ```\n\n    See below further examples with custom format:\n    ``` python\n    # Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 with labels\n    input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n\n    # Dataset for a model that takes 2 inputs, containing 100 batches of data with bs=5 with labels\n    input_data = [((torch.randn(5, 3, 256, 256), torch.randn(5, 3, 256, 256), ), torch.tensor([0, 1, 0, 1, 1])) for _ in range(100)]\n\n    # Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 without labels\n    input_data = [((torch.randn(1, 3, 256, 256), ), ) for _ in range(100)]\n    ```\n\n    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.\n\n=== \"PyTorch DataLoader\"\n    We support the following DataLoader types:\n\n    * Tensor only\n    * Tensor and labels\n\n\n    For models with multiple inputs, we support the following types:\n\n    - input_1, input_2, ..., input_n, label\n    - (input_1, input_2, ..., input_n), label\n\n    ```python\n    import torch\n    import torchvision.models as models\n\n    # Load a resnet as example\n    model = models.resnet50()\n\n    # Use your PyTorch DataLoader in any of the standard format\n    input_data = <insert your PyTorch DataLoader here>\n    ```\n\n    Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section.\n\n## 2) Run the optimization\nOnce the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` 
function to optimize your model. \n\nThe function takes the following arguments as inputs:\n\n- `model`: model to be optimized in your preferred framework (PyTorch in this case)\n- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)\n- `optimization_time`: if \"constrained\" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. \"unconstrained\" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation \n- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration\n\nand returns the accelerated version of your model 🚀.\n\n``` python\nfrom speedster import optimize_model\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    optimization_time=\"constrained\",\n    metric_drop_ths=0.05\n)\n```\n\nInternally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.\n\nAt the end of the optimization, you are going to see the results in a summary table like the following:\n\n![pt](../images/pt_table.png)\n\nIf the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.\n\nIf you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. 
Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.\n\n## 3) Save your optimized model\nAfter accelerating the model, it can be saved using the `save_model` function:\n\n```python\nfrom speedster import save_model\n\nsave_model(optimized_model, \"model_save_path\")\n```\n\nNow you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.\n\n## 4) Load and run your optimized model in production\nOnce the optimized model has been saved,  it can be loaded with the `load_model` function:\n```python\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\")\n```\n\nThe optimized model can be used for accelerated inference in the same way as the original model:\n\n```python\n# Use the accelerated version of your PyTorch model in production\noutput = optimized_model(input_sample)\n```\n\n!!! info\n    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.\n\nIf you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section."
  },
  {
    "path": "optimization/speedster/docs/en/docs/getting_started/tf_getting_started.md",
    "content": "# Getting started with TensorFlow optimization\nIn this section, we will learn about the 4 main steps needed to optimize TensorFlow models:\n\n1. [Input your model and data](#1-input-model-and-data)\n2. [Run the optimization](#2-run-the-optimization)\n3. [Save your optimized model](#3-save-your-optimized-model)\n4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)\n\n## 1) Input model and data\n\n!!! info\n    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). \n\nFor TensorFlow models we support two types of input data:\n\n* Custom data format\n* TensorFlow DataLoader\n\n=== \"Custom Data Format\"\n    Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]```\n\n    - Each element of the list is a tuple, which represents a batch of the dataset.\n    - In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. 
The label is optional, so it can be omitted.\n\n    ``` python\n    import tensorflow as tf\n    from tensorflow.keras.applications.resnet50 import ResNet50\n\n    # Load a resnet as example\n    model = ResNet50()\n\n    # Provide input data for the model    \n    input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)]\n    ```\n\n    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.\n\n=== \"TensorFlow DataLoader\"\n    We support the following DataLoader types:\n\n    * Tensor only\n    * Tensor and labels\n\n\n    For models with multiple inputs, we support the following types:\n\n    - input_1, input_2, ..., input_n, label\n    - (input_1, input_2, ..., input_n), label\n\n    ```python\n    import tensorflow as tf\n    from tensorflow.keras.applications.resnet50 import ResNet50\n\n    # Load a resnet as example\n    model = ResNet50()\n\n    # Use your TensorFlow DataLoader in any of the standard formats\n    input_data = <insert your TensorFlow DataLoader here>\n    ```\n\n    Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section.\n\n## 2) Run the optimization\nOnce the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. \n\nThe function takes the following arguments as inputs:\n\n- `model`: model to be optimized in your preferred framework (TensorFlow in this case)\n- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)\n- `optimization_time`: if \"constrained\" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. 
\"unconstrained\" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation \n- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration\n\nand returns the accelerated version of your model 🚀.\n\n``` python\nfrom speedster import optimize_model\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n    model, \n    input_data=input_data, \n    optimization_time=\"constrained\",\n    metric_drop_ths=0.05\n)\n```\n\nInternally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.\n\nAt the end of the optimization, you are going to see the results in a summary table like the following:\n\n![pt](../images/hf_table.png)\n\nIf the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.\n\nIf you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.\n\n## 3) Save your optimized model\nAfter accelerating the model, it can be saved using the `save_model` function:\n\n```python\nfrom speedster import save_model\n\nsave_model(optimized_model, \"model_save_path\")\n```\n\nNow you are all set to use your optimized model in production. 
To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.\n\n## 4) Load and run your optimized model in production\nOnce the optimized model has been saved,  it can be loaded with the `load_model` function:\n\n```python\nfrom speedster import load_model\n\noptimized_model = load_model(\"model_save_path\")\n```\n\nThe optimized model can be used for accelerated inference in the same way as the original model:\n\n```python\n# Use the accelerated version of your TensorFlow model in production\noutput = optimized_model(input_sample)\n```\n\n!!! info\n    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.\n\nIf you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section.\n"
  },
  {
    "path": "optimization/speedster/docs/en/docs/hardware.md",
    "content": "# Supported hardware\n\n`Speedster` has been mostly tested on Nvidia GPUs and Intel/AMD CPUs. The library may also work with other hardware on which has not been tested. Please let us know if you find out that `Speedster` works well on other hardware or if you find issues.\n\nFully supported hardware:\n\n- Intel CPU\n- Nvidia GPU\n\nHardware we are currently integrating:\n\n- Apple M1\n- AMD CPU\n- Intel GPU (open issue 👩‍💻)"
  },
  {
    "path": "optimization/speedster/docs/en/docs/installation.md",
    "content": "# Installation\nIn this installation guide we will learn:\n\n- [Quick installation](#quick-installation) of `Speedster` with pip **(Recommended)** \n\n- [Selective installation](#optional-selective-installation-of-speedster-requirements) of the requirements **(Optional)**\n\n- [Installation](#optional-download-docker-images-with-frameworks-and-optimizers) with Docker **(Optional)** \n\n- [Set up Speedster on custom DL devices](#set-up-speedster-on-custom-dl-devices) to run models on Google TPUs and AWS Inferentia Chips\n\n\n## Quick installation \nYou can easily install `Speedster` using pip.\n\n    pip install speedster\n\nThen make sure to install all the available deep learning compilers:\n\n    python -m nebullvm.installers.auto_installer --compilers all\n\n\n!!! info\n    If you want to optimize PyTorch or HuggingFace models, PyTorch must be pre-installed in the environment before using the auto-installer, please install it from [this](https://pytorch.org/get-started/locally/) link. Moreover, for Mac computers with M1/M2 processors, please use a conda environment, or you may run into problems when installing some of the deep learning compilers.\n\nGreat, now you are ready to accelerate your model 🚀 Please visit the following pages to get started based on the DL framework of your input model:\n\n- [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md)\n- [Getting started with 🤗 Hugging Face optimization](getting_started/hf_getting_started.md)\n- [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md)\n- [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md)\n- [Getting started with ONNX optimization](getting_started/onnx_getting_started.md)\n\n\n## (Optional) Selective installation of Speedster requirements\n\nBy default, the `auto_installer` installs all the DL frameworks and compilers supported by `Speedster`. 
However, some of these may not be relevant to your use case. In this section, we explain how you can customize the installation of these libraries, avoiding those that are not needed.\n\nTo customize the libraries installation you have two options:\n\n- [Use the auto-installer (recommended)](#use-the-auto-installer-recommended)\n- [Install the libraries manually](#manual-installation)\n\n### Use the auto-installer (recommended)\nTo understand how to selectively install your preferred libraries, let's examine the auto-installer API:\n\n```bash\npython -m nebullvm.installers.auto_installer \n    --frameworks <frameworks> \n    --extra-backends <backends> \n    --compilers <compilers>\n```\n\n!!! Description\n\n    === \"--frameworks\"\n\n        `frameworks` is used to specify the deep learning framework of your input model. The supported frameworks are `torch`, `tensorflow`, `onnx`, `huggingface` and `diffusers`.\n\n        - if you want to optimize a model with a single DL framework, the code is as follows (example below for HuggingFace):\n            \n            ```python\n            python -m nebullvm.installers.auto_installer --frameworks huggingface\n            ```\n            \n            Please remember that for PyTorch optimization, you should pre-install PyTorch from the official [repo](https://pytorch.org/get-started/locally/).\n                \n        - if you want to optimize models in multiple input frameworks, you can include them separated with a space:\n            ```python\n            python -m nebullvm.installers.auto_installer --frameworks tensorflow torch\n            ```\n\n        - If you want to include all the frameworks, you can use `all` as the argument:\n\n            ```python\n            python -m nebullvm.installers.auto_installer --frameworks all\n            ```\n\n        Default: `all`.\n    \n    === \"--extra-backends\"\n\n        After entering your input model, `Speedster` converts the input model from its original 
framework into an intermediate framework to be used during the optimization; we call these intermediate frameworks \"backends.\" To learn more, see the section [Model Converter](https://docs.nebuly.com/Speedster/key_concepts/) in the docs. This conversion allows `Speedster` to apply all optimization techniques without being constrained by the input framework of your model.\n            \n        The supported backends are `torch`, `tensorflow` and `onnx`.\n            \n        You can specify multiple backends by separating them with a space. \n            \n        - For example, if you want to install TensorFlow and ONNX as backends of a HuggingFace model, the code is as follows:\n            \n            ```python\n            python -m nebullvm.installers.auto_installer --frameworks huggingface --extra-backends tensorflow onnx\n            ```\n            \n        - If you want to install all the backends supported by the selected frameworks, you can use `all` as the argument.\n        - If you don't want to install extra backends, you can set `--extra-backends none`.\n            \n        The extra-backends that you choose must be compatible with at least one of the input frameworks you previously selected with the argument `--frameworks`, please see the table below to see the compatibility matrix. \n\n        Default: `all`.    \n\n    === \"--compilers\"\n\n        `compilers` is used to specify the deep learning compilers to be installed. The supported compilers are: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `openvino` and `intel_neural_compressor`. The compilers must be compatible with at least one of the backends selected with the argument `--extra-backends`, please see the table below to see the compatibility matrix.\n\n        - You can specify multiple compilers by separating them with a space. 
For example:\n            \n            ```python\n            --compilers deepsparse tensor_rt\n            ```\n            \n            will install DeepSparse and TensorRT. \n            \n        - If you want to install all the compilers supported by the selected frameworks/backends, you can use `all` as the argument.\n\n        Speedster also supports `torchscript`, `tf_lite`, and `onnxruntime` as built-in; these are preinstalled with their respective backends, so there is no need to include them in the list. Speedster also supports `tvm`, which is currently not supported by the automatic installer and must be installed manually; see the next section if you wish to include it.\n\n        Default: `all`.\n\n\nLet's see an example of how to use these three arguments:\n\n```bash\npython -m nebullvm.installers.auto_installer \n    --frameworks torch \n    --extra-backends all \n    --compilers all\n```\n\nThis command will setup your environment to optimize PyTorch models, and will install all PyTorch supported backends and compilers.\n\nThe following table shows the supported combinations of frameworks, backends and compilers that you can install with the auto-installer:\n\n| Framework    | Extra Backends            | Compilers                                                               |\n|--------------|---------------------------|-------------------------------------------------------------------------|\n| PyTorch      | ONNX                      | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |\n| TensorFlow   | ONNX                      | TensorRT, OpenVINO                                                      |\n| ONNX         | /                         | TensorRT, OpenVINO                                                      |\n| Hugging Face | PyTorch, TensorFlow, ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |\n| Diffusers    | PyTorch, ONNX             | DeepSparse, TensorRT, Torch 
TensorRT, OpenVINO, Intel Neural Compressor |\n\n\n!!! info\n    Hugging Face models can be of two types, PyTorch-based or TensorFlow-based. For PyTorch-based models, it is necessary to include `torch` as an extra-backend. For TensorFlow-based models, you must include `tensorflow` as an extra-backend.\n\n### Manual installation\n\nIf you want to manually install the requirements, this section collects links to the official installation guides for all frameworks and compilers supported by `Speedster`.\n\n#### Deep Learning frameworks/backends\n- PyTorch: https://pytorch.org/get-started/locally/\n- TensorFlow: https://www.tensorflow.org/install\n- ONNX: https://github.com/onnx/onnx#installation\n- HuggingFace: https://huggingface.co/transformers/installation.html\n- Diffusers: https://github.com/huggingface/diffusers#installation\n\n#### Deep Learning compilers\n- DeepSparse: https://github.com/neuralmagic/deepsparse#installation\n- TensorRT: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html\n- Torch TensorRT: https://pytorch.org/TensorRT/getting_started/installation.html#installation\n- ONNXRuntime: https://onnxruntime.ai/docs/install/#python-installs\n- OpenVINO: https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#step-4-install-the-package\n- Intel Neural Compressor: https://github.com/intel/neural-compressor#installation\n- Apache TVM: https://tvm.apache.org/docs/install/index.html\n\n#### Other requirements\n- tf2onnx: https://github.com/onnx/tensorflow-onnx#installation (Install it if you want to convert TensorFlow models to ONNX)\n- polygraphy: https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy#installation (Install it if you want to use TensorRT)\n- onnx-simplifier: https://github.com/daquexian/onnx-simplifier#python-version (Install it if you want to use TensorRT)\n- onnx_graphsurgeon: https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon#installation (Install it if you want to 
use TensorRT with Stable Diffusion)\n- onnxmltools: https://github.com/onnx/onnxmltools#install (Install it if you want to convert models to ONNX)\n\n## (Optional) Download Docker images with frameworks and optimizers\n\nInstead of installing the frameworks and compilers needed for optimization, which can be a time-consuming task, you can simply download a Docker container with all compilers preinstalled.\n\nTo pull up the Docker image, run:\n\n    docker pull nebulydocker/nebullvm:latest\n\nand then run and access the Docker with:\n\n    docker run -ti --gpus=all nebulydocker/nebullvm:latest\n\nAfter optimizing the model, you may decide to deploy it to production. Note that you need to have the deep learning compiler used to optimize the model and other components inside the production Docker. For this reason, we have created several versions of the Docker nebullvm container in the [Docker Hub](https://hub.docker.com/repository/docker/nebulydocker/nebullvm), each containing only one compiler. Pull the image with the compiler that has optimized your model!\n\n## Set up Speedster on custom DL devices\n\nFrom version `0.10.0`, Speedster supports optimization of PyTorch models on `Google TPUs` and `AWS Inferentia` chips. \nFor these devices, the user must ensure that the required libraries are installed on the machine. \nThe following sections describe how to install the required libraries for each device.\n\n### Google TPUs\n\nIn order to use a TPU, you must request a TPU-enabled VM from Google Cloud. 
You can consult the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en) \nfor more information about how to create a TPU VM and how to get started with PyTorch on TPUs.\n\nTo use Speedster on Google TPUs, we will use the [`torch_xla`](https://github.com/pytorch/xla) library, which is already \npreinstalled in all the Google Cloud TPU VMs, you will find it in the base Python3 environment.\n\nAfter creating the VM, you can follow these steps to set up Speedster:\n- Check that the `torch_xla` library is installed in the base Python3 environment. You can do this by running `python3 -c \"import torch_xla; print(torch_xla.__version__)\"` in the VM console;\n- Set TPU runtime configuration as explained in the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#set_tpu_runtime_configuration);\n- [Optional] Check that the TPU is working by running the [official example](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#perform_a_simple_calculation);\n- Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on TPUs.\n\nYou are now ready to use Speedster on TPUs! Speedster will automatically detect the TPU device and will use the `torch_xla` library to optimize the model, comparing its performances with the original model running on the CPU.\n\n### AWS Inferentia\n\nFor AWS Inferentia, you must first create an AWS EC2 instance with the `inf1` instance type. \nYou can find more information about `inf1` instances in the [official documentation](https://aws.amazon.com/it/ec2/instance-types/inf1/).\n\n!!! info\n    AWS has recently released the `inf2` instance type, which is a more powerful version of `inf1`. 
For now `inf2` instances are only available in private preview; you can request them directly from AWS by filling out this [form](https://pages.awscloud.com/EC2-Inf2-Preview.html).\n\nTo use Speedster on AWS Inferentia, we will use the [`torch-neuron`](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-setup.html) library, which must be manually installed on `inf1` instances (on `inf2` instances it's already preinstalled if you use the PyTorch DLAMI provided by AWS).\n\nYou can find here the full guides to set up the EC2 instances and install the required libraries:\n- `inf1`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/setup/pytorch-install.html#install-neuron-pytorch\n- `inf2`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/setup/pytorch-install.html#pytorch-neuronx-install\n\nAfter creating the EC2 instance and installing `torch_neuron`, you can follow these steps to set up Speedster:\n- Check that the `torch_neuron` library is installed. You can do this by running `python -c \"import torch_neuron; print(torch_neuron.__version__)\"` in the console (if using `inf1` instances, otherwise replace `torch_neuron` with `torch_neuronx`);\n- Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on AWS Inferentia.\n\nYou are now ready to use Speedster on AWS Inferentia! Speedster will automatically detect the AWS Inferentia device and will use the `torch_neuron` library to optimize the model, comparing its performance with the original model running on the CPU.\n"
  },
  {
    "path": "optimization/speedster/docs/en/docs/key_concepts.md",
    "content": "# Key concepts\n\nIn this section we are going to learn the architectural design of the 4 building blocks of `Speedster`.\n\n- [x]  **Converter**: converts the input model from its original framework to the framework backends supported by Speedster, namely PyTorch, TensorFlow, and ONNX. This allows the Compressor and Optimizer modules to apply any optimization technique to the model.\n- [x]  **Compressor**: applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training.\n- [x]  **Optimizer**: converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files.\n- [x]  **Inference Learner**: takes the best performing compiled model and converts it to the same interface as the original input model.\n\n![speedster_blocks](https://user-images.githubusercontent.com/42771598/213177175-a76908a2-5eef-4e82-9d54-0fc812131463.png)\n\nThe **compressor** stage leverages the following open-source projects:\n\n- [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance.\n- [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models.\n\nThe **compiler stage** leverages the following open-source projects:\n\n- [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators.\n- [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads.\n- 
[DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs.\n- [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference.\n- [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator.\n- [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators.\n- [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): open-source libraries to accelerate TensorFlow models.\n\n## Model converter\n!!! Definition\n    The Converter converts the input model from its original input framework to the framework backends supported by `Speedster`. This conversion enables the Compressor and the Compiler modules to apply all the optimization techniques without being constrained by the framework of your input model.\n\n![image info](images/converter.png)\n\n`Speedster` supports deep learning models in the following input frameworks:\n\n- Hugging Face\n- Diffusers\n- ONNX\n- PyTorch\n- TensorFlow\n\n`Speedster` now includes 3 backends:\n\n- **ONNX backend**, which supports models in any input framework.\n- **PyTorch backend**, which supports input models in PyTorch, ONNX and Hugging Face. \n- **TensorFlow backend**, which supports input models in TensorFlow and ONNX.\n\nAs you may notice, to date, not all cross-conversions from input frameworks to each `Speedster` backend are supported. \n\nLet's see a couple of examples to better understand the potential of the Converter block:\n\n1. PyTorch model as input: first of all Speedster will try the compilers available in the PyTorch backend pipeline, then it will convert it to ONNX and will also try the ones available in the ONNX backend optimization pipeline. 
Finally, the best one among them will be chosen and returned as the optimized model in your input framework (in this case PyTorch).\n\n2. Hugging Face model as input: Let's assume that for your specific use case, the best optimization technique is a specific type of dynamic quantization only supported by PyTorch. If you feed a Hugging Face model into Speedster, the Converter will first transform your model into a PyTorch model. Speedster will then quantize it and finally return it as a Hugging Face model.\n\n## Compressor\n\nThe compressor applies various compression techniques to the model:\n\n- Block-wise un/structured sparsity (🎉 launched in 0.4.0 🎉)\n- Knowledge distillation (to be supported)\n- Layer replacement (to be supported)\n- Low-rank compression (to be supported)\n- Quantization-aware training (to be supported)\n- SparseML (🎉 launched in 0.4.0 🎉)\n\n![image info](images/compressor.png)\n\n## Compiler\n\nThe Compiler block converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The different DL compilers perform both the low-level optimizations, which mostly consist of various quantization techniques, and graph optimizations. Finally, the model is compiled into binary.\n\n![image info](images/compiler.png)\n\nSupported deep learning compilers:\n\n- Apache TVM\n- BladeDISC (🎉 launched in 0.4.0 🎉)\n- DeepSparse (🎉 launched in 0.4.0 🎉)\n- MLIR (open pull request 👩‍💻)\n- ONNX Runtime\n- OpenVINO\n- TensorRT\n- TF Lite / XLA\n- TorchScript\n\nSupported low-level optimizations:\n\n- Static quantization\n- Dynamic quantization\n- Half-precision\n- Low-bit quantization on TVM (to be supported)\n\n## Inference learner\n\nThe Learner, or Inference Learner, selects the best-performing compiled model on your hardware and converts it to the same interface as the original input model.\n\n![image info](images/learner.png)"
  },
  {
    "path": "optimization/speedster/docs/en/docs/notebooks.md",
    "content": "# Notebooks\n\nIn this section you can find optimization notebooks for multiple DL input models:\n\n- HuggingFace\n- Diffusers\n- ONNX\n- Pytorch\n- Tensorflow\n\nPlease check out notebooks and tutorials on GitHub at [this](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) link."
  },
  {
    "path": "optimization/speedster/docs/en/docs/overview.md",
    "content": "# Overview\n\n\n`Speedster` is an open-source module designed to accelerate AI inference in just a few lines of code.\nThe library allows you to seamlessy modulate the inference performances of your AI models in terms of latency, throughput, model size, accuracy, cost and automatically applies the best set of optimization techniques along the software to hardware stack to meet your targets.\n\n`Speedster` makes it easy to combine optimization techniques across the whole software to hardware stack, delivering best in class speed-ups. If you like the idea, give us a star to support the project ⭐\n\n![speedster](https://user-images.githubusercontent.com/53374883/225600620-1cd84073-d9b3-43d1-84fa-c3e6c25eb915.png)\n\nThe core `Speedster` workflow consists of 3 steps:\n\n\n- [x]  **Select**: input your model in your preferred DL framework and express your preferences regarding:\n    - Accuracy loss: do you want to trade off a little accuracy for much higher performance?\n    - Optimization time: stellar accelerations can be time-consuming. Can you wait, or do you need an instant answer?\n- [x]  **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) that is compatible with your needs and local hardware.\n- [x]  **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just on steroids 🚀).\n\nNow you are ready to start accelerating your models, visit the [Installation](installation.md) section to start right away!\n"
  },
  {
    "path": "optimization/speedster/docs/en/docs/telemetry.md",
    "content": "# Telemetry\n\n\n`Speedster` is a young and rapidly evolving open-source project. There is plenty of room for improvement for Speedster to make your model achieve the very best performance on your hardware... and you may still find some bugs in the code 🪲\n\nContributions to this OSS project are warmly welcomed 🤗. We encourage you to check out the Contribution guidelines to understand how you can become an active contributor of the source code.\n\n## Sharing feedback to improve Speedster\n\nOpen source is a unique resource for sharing knowledge and building great projects collaboratively with the OSS community. To support the continued development, upon installation of Speedster you could share the information strictly necessary to improve the performance of this open-source project and facilitate bug detection and fixing.\n\nMore specifically, you will foster project enhancement by sharing details of the optimization techniques used with Speedster and the performance achieved on your model and hardware.\n\n**Which data do we collect?**\n\nWe make sure to collect as little data as possible to improve the open-source project:\n\n- basic information about the environment\n- basic information about the optimization\n\nPlease find below an example of telemetry collection:\n\n```python\n{\n\"nebullvm_version\": \"0.6.0\",\n\"app_version\": \"0.0.1\",\n\"model_id\": \"e33a1bbf-fcfd-4f5a-81c9-a9154c7e9343_-7088971112344091114\",\n\"model_metadata\": {\n    \"model_name\": \"ResNet\",\n    \"model_size\": \"102.23 MB\",\n    \"framework\": \"torch\"\n},\n\"hardware_setup\": {\n    \"cpu\": \"Apple M1 Pro\",\n    \"operative_system\": \"Darwin\",\n    \"ram\": \"17.18 GB\"\n},\n\"optimizations\": [\n    {\n        \"compiler\": \"torch\",\n        \"technique\": \"original\",\n        \"latency\": 0.03\n    },\n    {\n        \"compiler\": \"NUMPY_onnxruntime\",\n        \"technique\": \"none\",\n        \"latency\": 0.01\n    }\n],\n\"ip_address\": 
\"1.1.1.1\"\n}\n```\n\n**How to opt-out?**\n\nYou can simply opt-out from telemetry collection by setting the environment variable `SPEEDSTER_DISABLE_TELEMETRY to 1`.\n\n**Should I opt out?**\n\nBeing open-source, we have very limited visibility into the use of the tool unless someone actively contacts us or opens an issue on GitHub.\n\nWe would appreciate it if you would maintain telemetry, as it helps us improve the source code. In fact, it brings increasing value to the project and helps us to better prioritize feature development.\n\nWe understand that you may still prefer not to share telemetry data and we respect that desire. Please follow the steps above to disable data collection."
  },
  {
    "path": "optimization/speedster/docs/en/mkdocs.yaml",
    "content": "site_name: Speedster\n\ndocs_dir: ./docs\n\nnav:\n  - Overview: overview.md\n  - Installation: installation.md\n  - Getting started:\n    - PyTorch: getting_started/pytorch_getting_started.md\n    - 🤗 HuggingFace: getting_started/hf_getting_started.md\n    - 🧨 Stable Diffusion: getting_started/diffusers_getting_started.md\n    - TensorFlow/Keras: getting_started/tf_getting_started.md\n    - ONNX: getting_started/onnx_getting_started.md\n  - Notebooks: notebooks.md\n  - Key concepts: key_concepts.md\n  - Supported hardware: hardware.md\n  - Advanced options: advanced_options.md\n  - Benchmarks: benchmarks.md\n  - Telemetry: telemetry.md\n"
  },
  {
    "path": "optimization/speedster/notebooks/README.md",
    "content": "# **Jupyter notebooks**\n\nThis folder contains notebooks showing how to use the `Speedster` app to optimize several models. \n\nThe following frameworks are supported:\n- PyTorch\n- HuggingFace\n- Diffusers\n- Tensorflow\n- ONNX\n\nExamples of how to use `Speedster` are shown for each of these frameworks.\n\nIn each folder we provide links to google colab where you can easily test the notebooks. \nIf you want to test them on your own hardware, you can follow the guide below.\n\n## 1. Setup\nTo test notebooks, we have to create an environment where all the required dependencies are installed.\n\nFirst of all, clone the `nebullvm` repository:\n```\ngit clone https://github.com/nebuly-ai/nebullvm.git\n```\nNext, navigate to the repo's root directory:\n```\ncd nebullvm\n```\n\nAfter cloning the repository there are two options: we can either install `Speedster` in a local environment or use a ready-to-use docker container.\n\n### a. Using a local environment\n\nInstall `Speedster` library:\n```\npip install speedster\n```\n\nInstall deep learning compilers:\n```\npython -m nebullvm.installers.auto_installer \\\n    --frameworks all --compilers all\n```\n\nYou can find additional options and details on the official [installation guide](https://docs.nebuly.com/modules/speedster/installation).\n\nAfter everything has been installed, you can start a jupyter session with the following command:\n\n```\njupyter notebook --allow-root --port 8888\n```\nAnd navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`\n\nUse the token listed in the output from running the jupyter command to log in, for example:\n\n`http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`\n\nYou can finally navigate to the `notebooks/speedster` folder and then to the folder of the framework that you want to try and start a notebook.\n\n\n### b. 
Using a Docker container\n\nAnother very easy way to test the following notebooks is by using one of the docker containers released on [dockerhub](https://hub.docker.com/r/nebulydocker/nebullvm). \n\n\nPull the most up-to-date container image that has all compilers and their dependencies preinstalled:\n```\ndocker pull nebulydocker/nebullvm:latest\n```\nOnce pulled, the container can be launched with the following command:\n```\ndocker run --rm --gpus all -ti -p 8888:8888 -v $PWD:/nebullvm nebulydocker/nebullvm:latest\n```\nThe `-v` option in the command above allows to persist all the changes that will be done to the notebooks inside the container.\nPlease note that, in order to enable gpu inside docker, you have to ensure that nvidia docker is installed. Please follow the \"Setting up NVIDIA Container Toolkit\" part from the \nofficial [installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker).\nYou can then check that the gpu can be seen inside the container by running `nvidia-smi` inside it, and checking that your gpu appears in the output.\n\nInside the container, we can then navigate to the notebooks folder:\n```\ncd /nebullvm/notebooks/speedster\n```\nWe can then run a jupyter session with the following command:\n```\njupyter notebook --allow-root --ip 0.0.0.0 --port 8888\n```\nAnd navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`\n\nUse the token listed in the output from running the jupyter command to log in, for example:\n\n`http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`\n\nYou can finally navigate to the folder of the framework that you want to try and start a notebook.\n\n## 2. Contributions\nAt Nebuly we are always eager to see how our library manages to optimise more and more models. 
If you test nebullvm on your model and this is not already present among the notebooks, feel free to open a PR for us to add your notebook to the repository!\n"
  },
  {
    "path": "optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"ef331be9\",\n   \"metadata\": {\n    \"id\": \"ef331be9\"\n   },\n   \"source\": [\n    \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"f260653a\",\n   \"metadata\": {\n    \"id\": \"f260653a\"\n   },\n   \"source\": [\n    \"# Accelerate Stable Diffusion with Speedster\\n\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"8bdf3af5\",\n   \"metadata\": {\n    \"id\": \"8bdf3af5\"\n   },\n   \"source\": [\n    \"Hi and welcome 👋\\n\",\n    \"\\n\",\n    \"In this notebook we will discover how in just a few steps you can speed up the response time of Stable Diffusion inference using the Speedster module from the open-source library nebullvm. In the first section we will try using `Speedster` with the default configuration, then we will explore a more advanced option that involves the TensorRT plugins, that allow to accelerate Stable Diffusion further on GPU.\\n\",\n    \"\\n\",\n    \"Let's jump to the code.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"cXXh1ifQ13mH\",\n   \"metadata\": {\n    \"id\": \"cXXh1ifQ13mH\"\n   },\n   \"source\": [\n    \"# Installation\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"48aljCHu14-H\",\n   \"metadata\": {\n    \"id\": \"48aljCHu14-H\"\n   },\n   \"source\": [\n    \"Install Speedster:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"QFQh3BVr1-GO\",\n   \"metadata\": {\n    \"id\": \"QFQh3BVr1-GO\"\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install speedster\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"8a7a86b3\",\n   \"metadata\": {\n    \"id\": \"8a7a86b3\"\n   },\n   
\"source\": [\n    \"Install deep learning compilers:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"cffbfa32\",\n   \"metadata\": {\n    \"id\": \"cffbfa32\"\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m nebullvm.installers.auto_installer --frameworks diffusers --compilers all\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"c2ab3de7\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Environment check (GPU only)\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"61a1a445\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Please skip this section if you don't have a GPU**\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"e2784bb8\",\n   \"metadata\": {},\n   \"source\": [\n    \"If you want to optimize Stable Diffusion on a Nvidia GPU, in order to work properly, the following requirements must be installed on your machine:\\n\",\n    \"- `CUDA>=12.0`\\n\",\n    \"- `tensorrt>=8.6.0`\\n\",\n    \"- `torch<=1.13.1`\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"e3bc8b4d\",\n   \"metadata\": {},\n   \"source\": [\n    \"From TensorRT 8.6, all the tensorrt pre-built wheels released by nvidia support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. In that case, you will have to upgrade your CUDA version and then to upgarde tensorrt to 8.6.0 or above to execute this notebook.\\n\",\n    \"\\n\",\n    \"There should be a way to run TensorRT 8.6 also with CUDA 11, but it requires installing TensorRT in a different way, you can check this issue: https://github.com/NVIDIA/TensorRT/issues/2773. 
Otherwise, we highly suggest to just upgrade to CUDA 12.\\n\",\n    \"\\n\",\n    \"For now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully.\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"ec2267f0\",\n   \"metadata\": {},\n   \"source\": [\n    \"First of all, Let's check the CUDA version installed on the machine\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"82b78585\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import torch\\n\",\n    \"import subprocess\\n\",\n    \"\\n\",\n    \"if torch.cuda.is_available():\\n\",\n    \"    cuda_version = subprocess.check_output([\\\"nvidia-smi\\\"])\\n\",\n    \"    cuda_version = int(cuda_version.decode(\\\"utf-8\\\").split(\\\"\\\\n\\\")[2].split(\\\"|\\\")[-2].split(\\\":\\\")[-1].strip().split(\\\".\\\")[0])\\n\",\n    \"    assert cuda_version >= 12, (\\\"This notebook requires CUDA>=12.0 to be executed, please upgrade your CUDA version.\\\")\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"015cfa92\",\n   \"metadata\": {},\n   \"source\": [\n    \"If you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"563779e6\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then, let's check the tensorrt version installed on the platform. 
Stable Diffusion optimization is supported starting from `tensorrt==8.6.0`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"e385021d\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import tensorrt\\n\",\n    \"from nebullvm.tools.utils import check_module_version\\n\",\n    \"\\n\",\n    \"if torch.cuda.is_available():\\n\",\n    \"    assert check_module_version(tensorrt, \\\"8.6.0\\\"), (\\\"This notebook can be run only with tensorrt>=8.6.0, if using an older version you could have issues during the optimization. Please upgrade your version.\\\")\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"61da505b\",\n   \"metadata\": {},\n   \"source\": [\n    \"If you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running:\\n\",\n    \"```\\n\",\n    \"pip install -U tensorrt\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"3876bea4\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, let's check the PyTorch version\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"db83853f\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import torch\\n\",\n    \"\\n\",\n    \"from nebullvm.tools.utils import check_module_version\\n\",\n    \"\\n\",\n    \"assert check_module_version(torch, max_version=\\\"1.13.1+cu117\\\"), (\\\"This notebook can be run only with torch<=1.13.1, if using an older version you could have issues during the optimization. 
Please downgrade your version.\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"73072506\",\n   \"metadata\": {\n    \"id\": \"73072506\"\n   },\n   \"source\": [\n    \"## Model and Dataset setup\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"aeb2c521\",\n   \"metadata\": {},\n   \"source\": [\n    \"Once we have ensured that the required libraries are installed, we have to choose the version of Stable Diffusion we want to optimize. Speedster officially supports the most used versions:\\n\",\n    \"- `CompVis/stable-diffusion-v1-4`\\n\",\n    \"- `runwayml/stable-diffusion-v1-5`\\n\",\n    \"- `stabilityai/stable-diffusion-2-1-base`\\n\",\n    \"- `stabilityai/stable-diffusion-2-1` (only on GPUs with at least 22GB of memory; if you want to try with a GPU with less memory, you have to uncomment `pipe.enable_attention_slicing()` in the cell below)\\n\",\n    \"\\n\",\n    \"Other Stable Diffusion versions from the Diffusers library should work but have never been tested. If you try a version not included among these and it works, please feel free to report it to us on [Discord](https://discord.com/invite/RbeQMu886J) so we can add it to the list of supported versions. If you try a version that does not work, you can open an issue and possibly a PR on [GitHub](https://github.com/nebuly-ai/nebullvm/issues).\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"e4d55115\",\n   \"metadata\": {\n    \"id\": \"e4d55115\"\n   },\n   \"source\": [\n    \"For this notebook, we are going to select Stable Diffusion 1.4. 
Let's download and load it using the diffusers API:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"d633cf21\",\n   \"metadata\": {\n    \"id\": \"d633cf21\",\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import torch\\n\",\n    \"from diffusers import StableDiffusionPipeline\\n\",\n    \"\\n\",\n    \"# Select Stable Diffusion version\\n\",\n    \"model_id = \\\"CompVis/stable-diffusion-v1-4\\\"\\n\",\n    \"\\n\",\n    \"device = \\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\"\\n\",\n    \"\\n\",\n    \"if device == \\\"cuda\\\":\\n\",\n    \"    # On GPU we load by default the model in half precision, because it's faster and lighter.\\n\",\n    \"    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\\n\",\n    \"    # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\\n\",\n    \"else:\\n\",\n    \"    pipe = StableDiffusionPipeline.from_pretrained(model_id)\\n\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"11aa0739\",\n   \"metadata\": {\n    \"id\": \"11aa0739\"\n   },\n   \"source\": [\n    \"Let's now create an example dataset with some random sentences, that will be used later for the optimization process\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"cbbfeeb2\",\n   \"metadata\": {\n    \"id\": \"cbbfeeb2\"\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"input_data = [\\n\",\n    \"    \\\"a photo of an astronaut riding a horse on mars\\\",\\n\",\n    \"    \\\"a monkey eating a banana in a forest\\\",\\n\",\n    \"    \\\"white car on a road surrounded by palm trees\\\",\\n\",\n    \"    \\\"a fridge full of bottles of beer\\\",\\n\",\n    \"    \\\"madara uchiha throwing asteroids against people\\\"\\n\",\n    \"]\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"markdown\",\n   \"id\": \"17040431\",\n   \"metadata\": {\n    \"id\": \"17040431\"\n   },\n   \"source\": [\n    \"## Speed up inference with Speedster\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"44ddc21d\",\n   \"metadata\": {\n    \"id\": \"44ddc21d\"\n   },\n   \"source\": [\n    \"It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"f9d934f6\",\n   \"metadata\": {\n    \"id\": \"f9d934f6\"\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"from speedster import optimize_model, save_model, load_model\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"2799e3e3\",\n   \"metadata\": {},\n   \"source\": [\n    \"Let's move the pipe back to CPU to save up GPU memory, `Speedster` will automatically move it back to GPU when required.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"45220cf0\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import gc\\n\",\n    \"\\n\",\n    \"# Move the pipe back to cpu\\n\",\n    \"pipe.to(\\\"cpu\\\")\\n\",\n    \"\\n\",\n    \"# Clean memory\\n\",\n    \"torch.cuda.empty_cache()\\n\",\n    \"gc.collect()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"76248033\",\n   \"metadata\": {\n    \"id\": \"76248033\"\n   },\n   \"source\": [\n    \"Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape.\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"75b339c3\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Optimisation of stable diffusion requires a lot of RAM. 
If you are running this notebook on google colab, make sure to use the high RAM option, otherwise the kernel may crash. If the kernel crashes also when using the high RAM option, please try adding also `\\\"torchscript\\\"` to the `ignore_compilers` list. \\n\",\n    \"If running on GPU, the optimization requires at least 16GB of GPU memory to exploit the best techniques for optimizing the model, otherwise it may fail with a Memory Error**.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"zPC_EDwEJIM0\",\n   \"metadata\": {\n    \"id\": \"zPC_EDwEJIM0\"\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"optimized_model = optimize_model(\\n\",\n    \"    model=pipe,\\n\",\n    \"    input_data=input_data,\\n\",\n    \"    optimization_time=\\\"unconstrained\\\",\\n\",\n    \"    ignore_compilers=[\\\"torch_tensor_rt\\\", \\\"tvm\\\"],  # Some compilers have issues with Stable Diffusion, so it's better to skip them.\\n\",\n    \"    metric_drop_ths=0.2,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"fdae59d2\",\n   \"metadata\": {},\n   \"source\": [\n    \"If running on GPU, here you should obtain a speedup of about 124% on the UNet. We ran the optimization on a **3090Ti** and here are our results:\\n\",\n    \"- **Original Model (PyTorch, fp16): 51,557 ms/batch**\\n\",\n    \"- **Optimized Model (TensorRT, fp16): 23,055 ms/batch**\\n\",\n    \"\\n\",\n    \"If the optimized model you obtained is not a TensorRT one, probably there was an error during the optimization. 
If running on colab, it could happen that the standard gpu is not enough to run the optimization, so we suggest to select a premium gpu with more memory.\\n\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"af9f86ac\",\n   \"metadata\": {},\n   \"source\": [\n    \"If everything worked correctly, let's check the output of the optimized model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"7b640885\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_prompt = \\\"futuristic llama with a cyberpunk city on the background\\\"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"fa443637\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"optimized_model(test_prompt).images[0]\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"6e5b3b21\",\n   \"metadata\": {\n    \"id\": \"6e5b3b21\"\n   },\n   \"source\": [\n    \"Let's run the prediction 10 times to calculate the average response time of the original model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"09170c78\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if device == \\\"cuda\\\":\\n\",\n    \"    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\\n\",\n    \"    # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\\n\",\n    \"else:\\n\",\n    \"    pipe = StableDiffusionPipeline.from_pretrained(model_id)\\n\",\n    \"\\n\",\n    \"pipe.to(device)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"d3bc5c98\",\n   \"metadata\": {\n    \"colab\": {\n     \"base_uri\": \"https://localhost:8080/\"\n    },\n    \"id\": \"d3bc5c98\",\n    \"outputId\": 
\"e0596cf2-fa96-4c50-c012-f5cdab82e681\"\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import time\\n\",\n    \"\\n\",\n    \"times = []\\n\",\n    \"\\n\",\n    \"# Warmup for 2 iterations\\n\",\n    \"for _ in range(2):\\n\",\n    \"    with torch.no_grad():\\n\",\n    \"        final_out = pipe(test_prompt).images[0]\\n\",\n    \"\\n\",\n    \"# Benchmark\\n\",\n    \"for _ in range(8):\\n\",\n    \"    st = time.time()\\n\",\n    \"    with torch.no_grad():\\n\",\n    \"        final_out = pipe(test_prompt).images[0]\\n\",\n    \"    times.append(time.time()-st)\\n\",\n    \"original_model_time = sum(times)/len(times)\\n\",\n    \"print(f\\\"Average response time for original Stable Diffusion 1.4: {original_model_time} s\\\")\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"3db0a7a1\",\n   \"metadata\": {\n    \"id\": \"3db0a7a1\"\n   },\n   \"source\": [\n    \"Let's run the prediction 10 times to calculate the average response time of the optimized model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"a3e83997\",\n   \"metadata\": {\n    \"colab\": {\n     \"base_uri\": \"https://localhost:8080/\"\n    },\n    \"id\": \"a3e83997\",\n    \"outputId\": \"7a416b14-f170-4df9-d416-026f06a7d980\"\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"times = []\\n\",\n    \"\\n\",\n    \"for _ in range(2):\\n\",\n    \"    with torch.no_grad():\\n\",\n    \"        final_out = optimized_model(test_prompt).images[0]\\n\",\n    \"\\n\",\n    \"# Benchmark\\n\",\n    \"for _ in range(8):\\n\",\n    \"    st = time.time()\\n\",\n    \"    with torch.no_grad():\\n\",\n    \"        final_out = optimized_model(test_prompt).images[0]\\n\",\n    \"    times.append(time.time()-st)\\n\",\n    \"optimized_model_time = sum(times)/len(times)\\n\",\n    \"print(f\\\"Average response time for optimized Stable Diffusion 1.4: {optimized_model_time} s\\\")\"\n   ]\n  },\n  {\n   
\"cell_type\": \"markdown\",\n   \"id\": \"ceb60d8c\",\n   \"metadata\": {\n    \"id\": \"ceb60d8c\"\n   },\n   \"source\": [\n    \"## Save and reload the optimized model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"d9eda1a0\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can easily save to disk the optimized model with the following line:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"62b6fcbf\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"save_model(optimized_model, \\\"model_save_path\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"3c968d51\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can then load again the model:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"c1340c49\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"optimized_model = load_model(\\\"model_save_path\\\", pipe=pipe)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"cb234e5e\",\n   \"metadata\": {\n    \"id\": \"cb234e5e\"\n   },\n   \"source\": [\n    \"Great! Was it easy? How are the results? Do you have any comments?\\n\",\n    \"Share your optimization results and thoughts with <a href=\\\"https://discord.gg/RbeQMu886J\\\" target=\\\"_blank\\\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\\n\",\n    \"\\n\",\n    \"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. 
Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\\n\",\n    \"\\n\",\n    \"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"b77ff2ac\",\n   \"metadata\": {\n    \"id\": \"b77ff2ac\"\n   },\n   \"source\": [\n    \"<center> \\n\",\n    \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n    \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n    \"</center>\\n\",\n    \"\\n\",\n    \"<center> \\n\",\n    \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n    \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n    \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n    \"</center>\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"accelerator\": \"GPU\",\n  \"colab\": {\n   \"collapsed_sections\": [],\n   \"provenance\": []\n  },\n  \"gpuClass\": \"premium\",\n  \"kernelspec\": {\n   \"display_name\": \"Python 3 (ipykernel)\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": 
\"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.9.15\"\n  },\n  \"vscode\": {\n   \"interpreter\": {\n    \"hash\": \"4ca44071b2152bc556aa4c839392f76fd4b80aa39d34257f2d304fa0d1d8b7d9\"\n   }\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/diffusers/Readme.md",
    "content": "# **Diffusers Optimization**\n\n> :warning: In order to work properly, the diffusers optimization requires `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. For additional details, please look the docs [here](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).\n\nThis section contains all the available notebooks that show how to leverage Speedster to optimize Diffusers models.\n\n## Notebooks:\n| Notebook                                                                                                                                                                           | Description                                                                     |                                                                                                                                                                                                                                               |\n|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [Accelerate Diffusers Stable Diffusion](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb) | Show how to optimize with Speedster the Stable Diffusion models from Diffusers. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb) |\n\n## Diffusers API quick view:\n\n``` python\nimport torch\nfrom speedster import optimize_model\nfrom diffusers import StableDiffusionPipeline\n\n\n# Load Stable Diffusion 1.4 as example\nmodel_id = \"CompVis/stable-diffusion-v1-4\"\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\nif device == \"cuda\":\n    # On GPU we load by default the model in half precision, because it's faster and lighter.\n    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\nelse:\n    pipe = StableDiffusionPipeline.from_pretrained(model_id)\n\n# Create some example input data\ninput_data = [\n    \"a photo of an astronaut riding a horse on mars\",\n    \"a monkey eating a banana in a forest\",\n    \"white car on a road surrounded by palm trees\",\n    \"a fridge full of bottles of beer\",\n    \"madara uchiha throwing asteroids against people\"\n]\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n    model=pipe,\n    input_data=input_data,\n    optimization_time=\"unconstrained\",\n    ignore_compilers=[\"torch_tensor_rt\", \"tvm\"],\n    metric_drop_ths=0.1,\n)\n\n# Try the optimized model\ntest_prompt = \"futuristic llama with a cyberpunk city on the background\"\nres = optimized_model(test_prompt).images[0]\n```\n"
  },
  {
    "path": "optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ef331be9\",\n      \"metadata\": {\n        \"id\": \"ef331be9\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"f260653a\",\n      \"metadata\": {\n        \"id\": \"f260653a\"\n      },\n      \"source\": [\n        \"# Accelerate Hugging Face PyTorch BERT with Speedster\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8bdf3af5\",\n      \"metadata\": {\n        \"id\": \"8bdf3af5\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\\n\",\n        \"\\n\",\n        \"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d527d63b\",\n      \"metadata\": {\n        \"id\": \"d527d63b\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cXXh1ifQ13mH\",\n      \"metadata\": {\n        \"id\": \"cXXh1ifQ13mH\"\n      },\n      \"source\": [\n        \"# Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"73072506\",\n      \"metadata\": {\n        \"id\": \"73072506\"\n      },\n      \"source\": [\n        \"## Model and Dataset setup\"\n      ]\n    
},\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"cf24c4c4\",\n      \"metadata\": {},\n      \"source\": [\n        \"Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"1cf8ff74\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"import os\\n\",\n        \"\\n\",\n        \"tensorrt_path = \\\"/usr/local/lib/python3.8/dist-packages/tensorrt\\\"  # Change this path according to your TensorRT location\\n\",\n        \"\\n\",\n        \"if os.path.exists(tensorrt_path):\\n\",\n        \"    os.environ['LD_LIBRARY_PATH'] += f\\\":{tensorrt_path}\\\"\\n\",\n        \"else:\\n\",\n        \"    print(\\\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"e4d55115\",\n      \"metadata\": {\n        \"id\": \"e4d55115\"\n      },\n      \"source\": [\n        \"We chose BERT as the pre-trained model that we want to optimize. 
Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d633cf21\",\n      \"metadata\": {\n        \"id\": \"d633cf21\",\n        \"scrolled\": true\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"from transformers import BertTokenizer, BertModel\\n\",\n        \"\\n\",\n        \"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\\n\",\n        \"model = BertModel.from_pretrained('bert-base-uncased', torchscript=True)\\n\",\n        \"\\n\",\n        \"# Move the model to gpu if available and set eval mode\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"model.to(device).eval()\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"11aa0739\",\n      \"metadata\": {\n        \"id\": \"11aa0739\"\n      },\n      \"source\": [\n        \"Let's create an example dataset with some random sentences\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cbbfeeb2\",\n      \"metadata\": {\n        \"id\": \"cbbfeeb2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import random\\n\",\n        \"\\n\",\n        \"sentences = [\\n\",\n        \"    \\\"Mars is the fourth planet from the Sun.\\\",\\n\",\n        \"    \\\"has a crust primarily composed of elements\\\",\\n\",\n        \"    \\\"However, it is unknown\\\",\\n\",\n        \"    \\\"can be viewed from Earth\\\",\\n\",\n        \"    \\\"It was the Romans\\\",\\n\",\n        \"]\\n\",\n        \"\\n\",\n        \"len_dataset = 100\\n\",\n        \"\\n\",\n        \"texts = []\\n\",\n        \"for _ in range(len_dataset):\\n\",\n        \"    n_times = random.randint(1, 30)\\n\",\n        \"    texts.append(\\\" \\\".join(random.choice(sentences) 
for _ in range(n_times)))\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a09f9424\",\n      \"metadata\": {\n        \"id\": \"a09f9424\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"encoded_inputs = [tokenizer(text, return_tensors=\\\"pt\\\") for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"17040431\",\n      \"metadata\": {\n        \"id\": \"17040431\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: no metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"44ddc21d\",\n      \"metadata\": {\n        \"id\": \"44ddc21d\"\n      },\n      \"source\": [\n        \"It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f9d934f6\",\n      \"metadata\": {\n        \"id\": \"f9d934f6\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from speedster import optimize_model, save_model, load_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"76248033\",\n      \"metadata\": {\n        \"id\": \"76248033\"\n      },\n      \"source\": [\n        \"Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. 
Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"zPC_EDwEJIM0\",\n      \"metadata\": {\n        \"id\": \"zPC_EDwEJIM0\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"dynamic_info = {\\n\",\n        \"    \\\"inputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"    ],\\n\",\n        \"    \\\"outputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch'},\\n\",\n        \"    ]\\n\",\n        \"}\\n\",\n        \"\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"98c6ab09\",\n      \"metadata\": {\n        \"id\": \"98c6ab09\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\\n\",\n        \"\\n\",\n        \"# Move inputs to gpu if available\\n\",\n        \"encoded_inputs = [tokenizer(text, return_tensors=\\\"pt\\\").to(device) for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"6e5b3b21\",\n      \"metadata\": {\n        \"id\": \"6e5b3b21\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the original model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": 
\"d3bc5c98\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"d3bc5c98\",\n        \"outputId\": \"e0596cf2-fa96-4c50-c012-f5cdab82e681\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original DistilBERT: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"12c2df98\",\n      \"metadata\": {\n        \"id\": \"12c2df98\"\n      },\n      \"source\": [\n        \"Let's see the output of the original model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"4892a905\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"4892a905\",\n        \"outputId\": \"68d9b65f-e2cc-4998-8047-c9091f977698\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"3db0a7a1\",\n      \"metadata\": {\n        \"id\": \"3db0a7a1\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the optimized model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": 
null,\n      \"id\": \"a3e83997\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"a3e83997\",\n        \"outputId\": \"7a416b14-f170-4df9-d416-026f06a7d980\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized BERT (no metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"0d884d61\",\n      \"metadata\": {\n        \"id\": \"0d884d61\"\n      },\n      \"source\": [\n        \"Let's see the output of the optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"75611b2e\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"75611b2e\",\n        \"outputId\": \"035d5c6d-fd7a-4506-af09-befcf9dd3b2d\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      
\"id\": \"7b1950d5\",\n      \"metadata\": {\n        \"id\": \"7b1950d5\"\n      },\n      \"source\": [\n        \"This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain an higher speedup\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"de5721d8\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"de5721d8\",\n        \"outputId\": \"c9efff21-f963-47ff-e83d-a44615f90a10\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \"    metric_drop_ths=0.1,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"0fbfe6fa\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"0fbfe6fa\",\n        \"outputId\": \"ada293f5-9b54-4186-8e48-74b994d4b797\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n      
  \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original BERT: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f89b7e6d\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"f89b7e6d\",\n        \"outputId\": \"51e497e1-a533-432d-d68e-b373f0ef69cb\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"10d17b5c\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"10d17b5c\",\n        \"outputId\": \"d5dc0acd-77e7-4054-b455-19343ff37951\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized BERT (metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"6bf3d1fb\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"6bf3d1fb\",\n        \"outputId\": 
\"6163d8ba-254f-47d2-a468-a921622a15ba\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cb234e5e\",\n      \"metadata\": {\n        \"id\": \"cb234e5e\"\n      },\n      \"source\": [\n        \"Great! Was it easy? How are the results? Do you have any comments?\\n\",\n        \"Share your optimization results and thoughts with <a href=\\\"https://discord.gg/RbeQMu886J\\\" target=\\\"_blank\\\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\\n\",\n        \"\\n\",\n        \"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. 
Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\\n\",\n        \"\\n\",\n        \"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord.\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"premium\",\n    \"kernelspec\": {\n      \"display_name\": \"nebullvm_new\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n       
 \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.9.15\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"4fbc45cd27f7d363500c2e8640d9fdb717da4e1d8e4954a68e42b53d65ee27af\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ef331be9\",\n      \"metadata\": {\n        \"id\": \"ef331be9\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"f260653a\",\n      \"metadata\": {\n        \"id\": \"f260653a\"\n      },\n      \"source\": [\n        \"# Accelerate Hugging Face PyTorch DistilBERT with Speedster\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8bdf3af5\",\n      \"metadata\": {\n        \"id\": \"8bdf3af5\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\\n\",\n        \"\\n\",\n        \"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d527d63b\",\n      \"metadata\": {\n        \"id\": \"d527d63b\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cXXh1ifQ13mH\",\n      \"metadata\": {\n        \"id\": \"cXXh1ifQ13mH\"\n      },\n      \"source\": [\n        \"# Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"73072506\",\n      \"metadata\": {\n        \"id\": \"73072506\"\n      },\n      \"source\": [\n        \"## Model and Dataset setup\"\n      ]\n    
},\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"cf24c4c4\",\n      \"metadata\": {},\n      \"source\": [\n        \"Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"1cf8ff74\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"import os\\n\",\n        \"\\n\",\n        \"tensorrt_path = \\\"/usr/local/lib/python3.8/dist-packages/tensorrt\\\"  # Change this path according to your TensorRT location\\n\",\n        \"\\n\",\n        \"if os.path.exists(tensorrt_path):\\n\",\n        \"    os.environ['LD_LIBRARY_PATH'] += f\\\":{tensorrt_path}\\\"\\n\",\n        \"else:\\n\",\n        \"    print(\\\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"e4d55115\",\n      \"metadata\": {\n        \"id\": \"e4d55115\"\n      },\n      \"source\": [\n        \"We chose DistilBERT as the pre-trained model that we want to optimize. 
Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d633cf21\",\n      \"metadata\": {\n        \"id\": \"d633cf21\",\n        \"scrolled\": true\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"from transformers import DistilBertTokenizer, DistilBertModel\\n\",\n        \"\\n\",\n        \"tokenizer = DistilBertTokenizer.from_pretrained(\\\"distilbert-base-uncased\\\")\\n\",\n        \"model = DistilBertModel.from_pretrained(\\\"distilbert-base-uncased\\\", torchscript=True)\\n\",\n        \"\\n\",\n        \"# Move the model to gpu if available and set eval mode\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"model.to(device).eval()\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"11aa0739\",\n      \"metadata\": {\n        \"id\": \"11aa0739\"\n      },\n      \"source\": [\n        \"Let's create an example dataset with some random sentences\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cbbfeeb2\",\n      \"metadata\": {\n        \"id\": \"cbbfeeb2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import random\\n\",\n        \"\\n\",\n        \"sentences = [\\n\",\n        \"    \\\"Mars is the fourth planet from the Sun.\\\",\\n\",\n        \"    \\\"has a crust primarily composed of elements\\\",\\n\",\n        \"    \\\"However, it is unknown\\\",\\n\",\n        \"    \\\"can be viewed from Earth\\\",\\n\",\n        \"    \\\"It was the Romans\\\",\\n\",\n        \"]\\n\",\n        \"\\n\",\n        \"len_dataset = 100\\n\",\n        \"\\n\",\n        \"texts = []\\n\",\n        \"for _ in range(len_dataset):\\n\",\n        \"    n_times = random.randint(1, 30)\\n\",\n        \"    
texts.append(\\\" \\\".join(random.choice(sentences) for _ in range(n_times)))\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a09f9424\",\n      \"metadata\": {\n        \"id\": \"a09f9424\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"encoded_inputs = [tokenizer(text, return_tensors=\\\"pt\\\") for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"17040431\",\n      \"metadata\": {\n        \"id\": \"17040431\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: no metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"44ddc21d\",\n      \"metadata\": {\n        \"id\": \"44ddc21d\"\n      },\n      \"source\": [\n        \"It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f9d934f6\",\n      \"metadata\": {\n        \"id\": \"f9d934f6\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from speedster import optimize_model, save_model, load_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"76248033\",\n      \"metadata\": {\n        \"id\": \"76248033\"\n      },\n      \"source\": [\n        \"Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some example input data, and the optimization time mode. 
Optionally, a `dynamic_info` dictionary can also be provided, in order to support inputs with dynamic shape.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"zPC_EDwEJIM0\",\n      \"metadata\": {\n        \"id\": \"zPC_EDwEJIM0\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"dynamic_info = {\\n\",\n        \"    \\\"inputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'}\\n\",\n        \"    ],\\n\",\n        \"    \\\"outputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'}\\n\",\n        \"    ]\\n\",\n        \"}\\n\",\n        \"\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"98c6ab09\",\n      \"metadata\": {\n        \"id\": \"98c6ab09\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\\n\",\n        \"\\n\",\n        \"# Move inputs to gpu if available\\n\",\n        \"encoded_inputs = [tokenizer(text, return_tensors=\\\"pt\\\").to(device) for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"6e5b3b21\",\n      \"metadata\": {\n        \"id\": \"6e5b3b21\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the original model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d3bc5c98\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": 
\"https://localhost:8080/\"\n        },\n        \"id\": \"d3bc5c98\",\n        \"outputId\": \"e0596cf2-fa96-4c50-c012-f5cdab82e681\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original DistilBERT: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"12c2df98\",\n      \"metadata\": {\n        \"id\": \"12c2df98\"\n      },\n      \"source\": [\n        \"Let's see the output of the original model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"4892a905\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"4892a905\",\n        \"outputId\": \"68d9b65f-e2cc-4998-8047-c9091f977698\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"3db0a7a1\",\n      \"metadata\": {\n        \"id\": \"3db0a7a1\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the optimized model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a3e83997\",\n      \"metadata\": {\n        \"colab\": {\n      
    \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"a3e83997\",\n        \"outputId\": \"7a416b14-f170-4df9-d416-026f06a7d980\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized DistilBERT (no metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"0d884d61\",\n      \"metadata\": {\n        \"id\": \"0d884d61\"\n      },\n      \"source\": [\n        \"Let's see the output of the optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"75611b2e\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"75611b2e\",\n        \"outputId\": \"035d5c6d-fd7a-4506-af09-befcf9dd3b2d\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"7b1950d5\",\n      \"metadata\": {\n        \"id\": \"7b1950d5\"\n      
},\n      \"source\": [\n        \"This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"de5721d8\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"de5721d8\",\n        \"outputId\": \"c9efff21-f963-47ff-e83d-a44615f90a10\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \"    metric_drop_ths=0.1,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"0fbfe6fa\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"0fbfe6fa\",\n        \"outputId\": \"ada293f5-9b54-4186-8e48-74b994d4b797\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        
\"print(f\\\"Average response time for original DistilBERT: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f89b7e6d\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"f89b7e6d\",\n        \"outputId\": \"51e497e1-a533-432d-d68e-b373f0ef69cb\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"10d17b5c\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"10d17b5c\",\n        \"outputId\": \"d5dc0acd-77e7-4054-b455-19343ff37951\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized DistilBERT (metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"6bf3d1fb\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"6bf3d1fb\",\n        \"outputId\": \"6163d8ba-254f-47d2-a468-a921622a15ba\"\n      },\n      \"outputs\": [],\n      
\"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cb234e5e\",\n      \"metadata\": {\n        \"id\": \"cb234e5e\"\n      },\n      \"source\": [\n        \"Great! Was it easy? How are the results? Do you have any comments?\\n\",\n        \"Share your optimization results and thoughts with <a href=\\\"https://discord.gg/RbeQMu886J\\\" target=\\\"_blank\\\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\\n\",\n        \"\\n\",\n        \"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. 
Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\\n\",\n        \"\\n\",\n        \"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord.\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"premium\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.8.10 64-bit\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": 
{\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.8.9 (default, Apr 13 2022, 08:48:06) \\n[Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ef331be9\",\n      \"metadata\": {\n        \"id\": \"ef331be9\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"f260653a\",\n      \"metadata\": {\n        \"id\": \"f260653a\"\n      },\n      \"source\": [\n        \"# Accelerate Hugging Face PyTorch GPT2 with Speedster\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8bdf3af5\",\n      \"metadata\": {\n        \"id\": \"8bdf3af5\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\\n\",\n        \"\\n\",\n        \"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d527d63b\",\n      \"metadata\": {\n        \"id\": \"d527d63b\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cXXh1ifQ13mH\",\n      \"metadata\": {\n        \"id\": \"cXXh1ifQ13mH\"\n      },\n      \"source\": [\n        \"# Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"73072506\",\n      \"metadata\": {\n        \"id\": \"73072506\"\n      },\n      \"source\": [\n        \"## Model and Dataset setup\"\n      ]\n    
},\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"cf24c4c4\",\n      \"metadata\": {},\n      \"source\": [\n        \"Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"1cf8ff74\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"import os\\n\",\n        \"\\n\",\n        \"tensorrt_path = \\\"/usr/local/lib/python3.8/dist-packages/tensorrt\\\"  # Change this path according to your TensorRT location\\n\",\n        \"\\n\",\n        \"if os.path.exists(tensorrt_path):\\n\",\n        \"    os.environ['LD_LIBRARY_PATH'] += f\\\":{tensorrt_path}\\\"\\n\",\n        \"else:\\n\",\n        \"    print(\\\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"e4d55115\",\n      \"metadata\": {\n        \"id\": \"e4d55115\"\n      },\n      \"source\": [\n        \"We chose GPT2 as the pre-trained model that we want to optimize. 
Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d633cf21\",\n      \"metadata\": {\n        \"colab\": {\n          \"background_save\": true\n        },\n        \"id\": \"d633cf21\",\n        \"scrolled\": true\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"from transformers import GPT2Tokenizer, GPT2Model\\n\",\n        \"\\n\",\n        \"tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\\n\",\n        \"model = GPT2Model.from_pretrained('gpt2', torchscript=True)\\n\",\n        \"\\n\",\n        \"# Move the model to gpu if available and set eval mode\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"model.to(device).eval()\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"11aa0739\",\n      \"metadata\": {\n        \"id\": \"11aa0739\"\n      },\n      \"source\": [\n        \"Let's create an example dataset with some random sentences\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cbbfeeb2\",\n      \"metadata\": {\n        \"colab\": {\n          \"background_save\": true\n        },\n        \"id\": \"cbbfeeb2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import random\\n\",\n        \"\\n\",\n        \"sentences = [\\n\",\n        \"    \\\"Mars is the fourth planet from the Sun.\\\",\\n\",\n        \"    \\\"has a crust primarily composed of elements\\\",\\n\",\n        \"    \\\"However, it is unknown\\\",\\n\",\n        \"    \\\"can be viewed from Earth\\\",\\n\",\n        \"    \\\"It was the Romans\\\",\\n\",\n        \"]\\n\",\n        \"\\n\",\n        \"len_dataset = 100\\n\",\n        \"\\n\",\n        \"texts = []\\n\",\n        \"for _ in range(len_dataset):\\n\",\n      
  \"    n_times = random.randint(1, 30)\\n\",\n        \"    texts.append(\\\" \\\".join(random.choice(sentences) for _ in range(n_times)))\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a09f9424\",\n      \"metadata\": {\n        \"colab\": {\n          \"background_save\": true\n        },\n        \"id\": \"a09f9424\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"encoded_inputs = [tokenizer(text, return_tensors=\\\"pt\\\") for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"17040431\",\n      \"metadata\": {\n        \"id\": \"17040431\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: no metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"44ddc21d\",\n      \"metadata\": {\n        \"id\": \"44ddc21d\"\n      },\n      \"source\": [\n        \"It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f9d934f6\",\n      \"metadata\": {\n        \"id\": \"f9d934f6\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from speedster import optimize_model, save_model, load_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"76248033\",\n      \"metadata\": {\n        \"id\": \"76248033\"\n      },\n      \"source\": [\n        \"Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. 
Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"zPC_EDwEJIM0\",\n      \"metadata\": {\n        \"id\": \"zPC_EDwEJIM0\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"dynamic_info = {\\n\",\n        \"    \\\"inputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'}\\n\",\n        \"    ],\\n\",\n        \"    \\\"outputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"    ] + [{0: 'batch', 2: 'num_tokens'} for i in range(24)]\\n\",\n        \"}\\n\",\n        \"\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"98c6ab09\",\n      \"metadata\": {\n        \"id\": \"98c6ab09\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\\n\",\n        \"\\n\",\n        \"# Move inputs to gpu if available\\n\",\n        \"encoded_inputs = [tokenizer(text, return_tensors=\\\"pt\\\").to(device) for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"6e5b3b21\",\n      \"metadata\": {\n        \"id\": \"6e5b3b21\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the original model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d3bc5c98\",\n      \"metadata\": {\n        \"id\": 
\"d3bc5c98\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original GPT2: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"12c2df98\",\n      \"metadata\": {\n        \"id\": \"12c2df98\"\n      },\n      \"source\": [\n        \"Let's see the output of the original model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"4892a905\",\n      \"metadata\": {\n        \"id\": \"4892a905\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"3db0a7a1\",\n      \"metadata\": {\n        \"id\": \"3db0a7a1\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the optimized model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a3e83997\",\n      \"metadata\": {\n        \"id\": \"a3e83997\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = 
optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized GPT2 (no metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"0d884d61\",\n      \"metadata\": {\n        \"id\": \"0d884d61\"\n      },\n      \"source\": [\n        \"Let's see the output of the optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"75611b2e\",\n      \"metadata\": {\n        \"id\": \"75611b2e\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"7b1950d5\",\n      \"metadata\": {\n        \"id\": \"7b1950d5\"\n      },\n      \"source\": [\n        \"This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"de5721d8\",\n      \"metadata\": {\n        \"id\": \"de5721d8\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    
optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \"    metric_drop_ths=0.1,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"0fbfe6fa\",\n      \"metadata\": {\n        \"id\": \"0fbfe6fa\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original GPT2: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f89b7e6d\",\n      \"metadata\": {\n        \"id\": \"f89b7e6d\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"10d17b5c\",\n      \"metadata\": {\n        \"id\": \"10d17b5c\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n     
   \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized GPT2 (metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"6bf3d1fb\",\n      \"metadata\": {\n        \"id\": \"6bf3d1fb\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": 
\"cb234e5e\",\n      \"metadata\": {\n        \"id\": \"cb234e5e\"\n      },\n      \"source\": [\n        \"Great! Was it easy? How are the results? Do you have any comments?\\n\",\n        \"Share your optimization results and thoughts with <a href=\\\"https://discord.gg/RbeQMu886J\\\" target=\\\"_blank\\\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\\n\",\n        \"\\n\",\n        \"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\\n\",\n        \"\\n\",\n        \"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord.\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a 
href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"premium\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.8.10 64-bit\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.8.9 (default, Apr 13 2022, 08:48:06) \\n[Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ef331be9\",\n      \"metadata\": {\n        \"id\": \"ef331be9\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"f260653a\",\n      \"metadata\": {\n        \"id\": \"f260653a\"\n      },\n      \"source\": [\n        \"# Accelerate Hugging Face T5 with Speedster\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8bdf3af5\",\n      \"metadata\": {\n        \"id\": \"8bdf3af5\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\\n\",\n        \"\\n\",\n        \"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d527d63b\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"d527d63b\",\n        \"outputId\": \"57626bac-e458-487f-f4fa-a459627af296\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cXXh1ifQ13mH\",\n      \"metadata\": {\n        \"id\": \"cXXh1ifQ13mH\"\n      },\n      \"source\": [\n        \"# Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": 
\"73072506\",\n      \"metadata\": {\n        \"id\": \"73072506\"\n      },\n      \"source\": [\n        \"## Model and Dataset setup\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"cf24c4c4\",\n      \"metadata\": {},\n      \"source\": [\n        \"Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"1cf8ff74\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"import os\\n\",\n        \"\\n\",\n        \"tensorrt_path = \\\"/usr/local/lib/python3.8/dist-packages/tensorrt\\\"  # Change this path according to your TensorRT location\\n\",\n        \"\\n\",\n        \"if os.path.exists(tensorrt_path):\\n\",\n        \"    os.environ['LD_LIBRARY_PATH'] += f\\\":{tensorrt_path}\\\"\\n\",\n        \"else:\\n\",\n        \"    print(\\\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"e4d55115\",\n      \"metadata\": {\n        \"id\": \"e4d55115\"\n      },\n      \"source\": [\n        \"We chose T5-efficient-base as the pre-trained model that we want to optimize. 
Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"NOgOmfdY_dav\",\n      \"metadata\": {\n        \"id\": \"NOgOmfdY_dav\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\\n\",\n        \"import torch\\n\",\n        \"\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"\\n\",\n        \"model_name = \\\"google/t5-efficient-base\\\"\\n\",\n        \"\\n\",\n        \"tokenizer = AutoTokenizer.from_pretrained(model_name)\\n\",\n        \"model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torchscript=True).to(device)\\n\",\n        \"\\n\",\n        \"# set the model to eval mode\\n\",\n        \"_ = model.eval()\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"11aa0739\",\n      \"metadata\": {\n        \"id\": \"11aa0739\"\n      },\n      \"source\": [\n        \"Let's create an example dataset with some random sentences\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"ghGcDNFtKt3X\",\n      \"metadata\": {\n        \"id\": \"ghGcDNFtKt3X\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"texts = [\\n\",\n        \"    \\\"\\\"\\\"BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.\\\"\\\"\\\",\\n\",\n        \"    \\\"\\\"\\\"GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. 
This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it was trained to guess the next word in sentences.\\\"\\\"\\\",\\n\",\n        \"    \\\"\\\"\\\"With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task.\\\"\\\"\\\",\\n\",\n        \"    \\\"\\\"\\\"LayoutLMv3 is a pre-trained multimodal Transformer for Document AI with unified text and image masking. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model. For example, LayoutLMv3 can be fine-tuned for both text-centric tasks, including form understanding, receipt understanding, and document visual question answering, and image-centric tasks such as document image classification and document layout analysis.\\\"\\\"\\\",\\n\",\n        \"    \\\"\\\"\\\"XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context. 
Overall, XLNet achieves state-of-the-art (SOTA) results on various downstream language tasks including question answering, natural language inference, sentiment analysis, and document ranking.\\\"\\\"\\\"\\n\",\n        \"]\\n\",\n        \"texts = texts*20\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a09f9424\",\n      \"metadata\": {\n        \"id\": \"a09f9424\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"encoded_inputs = [tokenizer(text, padding=\\\"longest\\\", return_tensors=\\\"pt\\\") for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"17040431\",\n      \"metadata\": {\n        \"id\": \"17040431\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: no metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"44ddc21d\",\n      \"metadata\": {\n        \"id\": \"44ddc21d\"\n      },\n      \"source\": [\n        \"It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f9d934f6\",\n      \"metadata\": {\n        \"id\": \"f9d934f6\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from speedster import optimize_model, save_model, load_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"76248033\",\n      \"metadata\": {\n        \"id\": \"76248033\"\n      },\n      \"source\": [\n        \"Usually Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. But for this type of model, we need to take some extra steps because the current version of Speedster doesn't have direct support for encoder-decoder models. These models have both an encoder and a decoder. 
For example, BERT models are encoder models and GPT models are decoder models, but T5 has both.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"i7sgUWjePN9i\",\n      \"metadata\": {\n        \"id\": \"i7sgUWjePN9i\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# First, we get the encoder and decoder from the model\\n\",\n        \"encoder = model.get_encoder()\\n\",\n        \"decoder = model.get_decoder()\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"O7xaI1drQOQ0\",\n      \"metadata\": {\n        \"id\": \"O7xaI1drQOQ0\"\n      },\n      \"source\": [\n        \"Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"nTUPdDchQLc1\",\n      \"metadata\": {\n        \"id\": \"nTUPdDchQLc1\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"dynamic_info = {\\n\",\n        \"    \\\"inputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'}\\n\",\n        \"    ],\\n\",\n        \"    \\\"outputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"    ]\\n\",\n        \"}\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"zPC_EDwEJIM0\",\n      \"metadata\": {\n        \"id\": \"zPC_EDwEJIM0\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Create the optimized encoder model separately\\n\",\n        \"optimized_encoder_model = optimize_model(\\n\",\n        \"    model=encoder,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \" 
   dynamic_info=dynamic_info,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"7Oa68a87Qjre\",\n      \"metadata\": {\n        \"id\": \"7Oa68a87Qjre\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Create the optimized decoder model separately\\n\",\n        \"optimized_decoder_model = optimize_model(\\n\",\n        \"    model=decoder,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"98c6ab09\",\n      \"metadata\": {\n        \"id\": \"98c6ab09\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\\n\",\n        \"\\n\",\n        \"# Move inputs to gpu if available\\n\",\n        \"encoded_inputs = [tokenizer(text, padding=\\\"longest\\\", return_tensors=\\\"pt\\\").to(device) for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"6e5b3b21\",\n      \"metadata\": {\n        \"id\": \"6e5b3b21\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the original model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d3bc5c98\",\n      \"metadata\": {\n        \"id\": \"d3bc5c98\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = encoder(**encoded_input)\\n\",\n        \"        decoder_out = 
decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = encoder(**encoded_input)\\n\",\n        \"        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original T5: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"GU0SwykMTVAj\",\n      \"metadata\": {\n        \"id\": \"GU0SwykMTVAj\"\n      },\n      \"source\": [\n        \"In real-world use cases, we pass the decoder output to `model.lm_head` to get the actual prediction, but here we are only testing the performance improvements, so we skip that step.\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"12c2df98\",\n      \"metadata\": {\n        \"id\": \"12c2df98\"\n      },\n      \"source\": [\n        \"Let's see the output of the original model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"4892a905\",\n      \"metadata\": {\n        \"id\": \"4892a905\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"encoder(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"gx0naPVuSVrm\",\n      \"metadata\": {\n        \"id\": \"gx0naPVuSVrm\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"3db0a7a1\",\n      \"metadata\": {\n        \"id\": \"3db0a7a1\"\n      },\n      \"source\": [\n        \"Let's run 
the prediction 100 times to calculate the average response time of the optimized model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a3e83997\",\n      \"metadata\": {\n        \"id\": \"a3e83997\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = optimized_encoder_model(**encoded_input)\\n\",\n        \"        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = optimized_encoder_model(**encoded_input)\\n\",\n        \"        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized T5 (no metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"0d884d61\",\n      \"metadata\": {\n        \"id\": \"0d884d61\"\n      },\n      \"source\": [\n        \"Let's see the output of the optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"75611b2e\",\n      \"metadata\": {\n        \"id\": \"75611b2e\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_encoder_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cpieoDfwS-V7\",\n      \"metadata\": {\n        \"id\": 
\"cpieoDfwS-V7\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"7b1950d5\",\n      \"metadata\": {\n        \"id\": \"7b1950d5\"\n      },\n      \"source\": [\n        \"This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"VwOLWZSZUM89\",\n      \"metadata\": {\n        \"id\": \"VwOLWZSZUM89\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_encoder_model = optimize_model(\\n\",\n        \"    model=encoder,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \"    metric_drop_ths=0.1,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"FIKn4V3dUIZB\",\n      \"metadata\": {\n        \"id\": \"FIKn4V3dUIZB\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_decoder_model = optimize_model(\\n\",\n        \"    model=decoder,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tensor_rt\\\", \\\"tvm\\\"],  # TensorRT does not work for this model\\n\",\n        \"    
dynamic_info=dynamic_info,\\n\",\n        \"    metric_drop_ths=0.1,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"0fbfe6fa\",\n      \"metadata\": {\n        \"id\": \"0fbfe6fa\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = encoder(**encoded_input)\\n\",\n        \"        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = encoder(**encoded_input)\\n\",\n        \"        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original T5: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f89b7e6d\",\n      \"metadata\": {\n        \"id\": \"f89b7e6d\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"encoder(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"oI1zjIBSUoIU\",\n      \"metadata\": {\n        \"id\": \"oI1zjIBSUoIU\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"10d17b5c\",\n      \"metadata\": {\n        \"id\": \"10d17b5c\"\n      },\n      \"outputs\": [],\n      
\"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = optimized_encoder_model(**encoded_input)\\n\",\n        \"        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    with torch.no_grad():\\n\",\n        \"        encoder_out = optimized_encoder_model(**encoded_input)\\n\",\n        \"        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized T5 (metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"4XFMC1S6zXTU\",\n      \"metadata\": {\n        \"id\": \"4XFMC1S6zXTU\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"OXHVr3EAzbT5\",\n      \"metadata\": {\n        \"id\": \"OXHVr3EAzbT5\"\n      },\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"3M565P-zzaFB\",\n      \"metadata\": {\n        \"id\": \"3M565P-zzaFB\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_encoder_model, \\\"encoder_model_save_path\\\")\\n\",\n        \"save_model(optimized_decoder_model, \\\"decoder_model_save_path\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": 
\"ee8CS_Evzg1j\",\n      \"metadata\": {\n        \"id\": \"ee8CS_Evzg1j\"\n      },\n      \"source\": [\n        \"We can then load again the model:\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"zOQ88SY_zg-A\",\n      \"metadata\": {\n        \"id\": \"zOQ88SY_zg-A\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_encoder_model = load_model(\\\"encoder_model_save_path\\\")\\n\",\n        \"optimized_decoder_model = load_model(\\\"decoder_model_save_path\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cb234e5e\",\n      \"metadata\": {\n        \"id\": \"cb234e5e\"\n      },\n      \"source\": [\n        \"Great! Was it easy? How are the results? Do you have any comments?\\n\",\n        \"Share your optimization results and thoughts with <a href=\\\"https://discord.gg/RbeQMu886J\\\" target=\\\"_blank\\\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\\n\",\n        \"\\n\",\n        \"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. 
Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\\n\",\n        \"\\n\",\n        \"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord.\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"provenance\": []\n    },\n    \"gpuClass\": \"premium\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      
\"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ef331be9\",\n      \"metadata\": {\n        \"id\": \"ef331be9\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"f260653a\",\n      \"metadata\": {\n        \"id\": \"f260653a\"\n      },\n      \"source\": [\n        \"# Accelerate Hugging Face TensorFlow BERT with Speedster\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8bdf3af5\",\n      \"metadata\": {\n        \"id\": \"8bdf3af5\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\\n\",\n        \"\\n\",\n        \"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d527d63b\",\n      \"metadata\": {\n        \"id\": \"d527d63b\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cXXh1ifQ13mH\",\n      \"metadata\": {\n        \"id\": \"cXXh1ifQ13mH\"\n      },\n      \"source\": [\n        \"# Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"73072506\",\n      \"metadata\": {\n        \"id\": \"73072506\"\n      },\n      \"source\": [\n        \"## Model and Dataset setup\"\n      ]\n    
},\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"cf24c4c4\",\n      \"metadata\": {},\n      \"source\": [\n        \"Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"1cf8ff74\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"import os\\n\",\n        \"\\n\",\n        \"tensorrt_path = \\\"/usr/local/lib/python3.8/dist-packages/tensorrt\\\"  # Change this path according to your TensorRT location\\n\",\n        \"\\n\",\n        \"if os.path.exists(tensorrt_path):\\n\",\n        \"    os.environ['LD_LIBRARY_PATH'] += f\\\":{tensorrt_path}\\\"\\n\",\n        \"else:\\n\",\n        \"    print(\\\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"e4d55115\",\n      \"metadata\": {\n        \"id\": \"e4d55115\"\n      },\n      \"source\": [\n        \"We chose BERT as the pre-trained model that we want to optimize. 
Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d633cf21\",\n      \"metadata\": {\n        \"id\": \"d633cf21\",\n        \"scrolled\": true\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from transformers import BertTokenizer, TFBertModel\\n\",\n        \"\\n\",\n        \"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\\n\",\n        \"model = TFBertModel.from_pretrained('bert-base-uncased')\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"11aa0739\",\n      \"metadata\": {\n        \"id\": \"11aa0739\"\n      },\n      \"source\": [\n        \"Let's create an example dataset with some random sentences\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cbbfeeb2\",\n      \"metadata\": {\n        \"id\": \"cbbfeeb2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import random\\n\",\n        \"\\n\",\n        \"sentences = [\\n\",\n        \"    \\\"Mars is the fourth planet from the Sun.\\\",\\n\",\n        \"    \\\"has a crust primarily composed of elements\\\",\\n\",\n        \"    \\\"However, it is unknown\\\",\\n\",\n        \"    \\\"can be viewed from Earth\\\",\\n\",\n        \"    \\\"It was the Romans\\\",\\n\",\n        \"]\\n\",\n        \"\\n\",\n        \"len_dataset = 100\\n\",\n        \"\\n\",\n        \"texts = []\\n\",\n        \"for _ in range(len_dataset):\\n\",\n        \"    n_times = random.randint(1, 30)\\n\",\n        \"    texts.append(\\\" \\\".join(random.choice(sentences) for _ in range(n_times)))\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a09f9424\",\n      \"metadata\": {\n        \"id\": \"a09f9424\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        
\"encoded_inputs = [tokenizer(text, return_tensors=\\\"tf\\\") for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"17040431\",\n      \"metadata\": {\n        \"id\": \"17040431\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: no metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"44ddc21d\",\n      \"metadata\": {\n        \"id\": \"44ddc21d\"\n      },\n      \"source\": [\n        \"It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f9d934f6\",\n      \"metadata\": {\n        \"id\": \"f9d934f6\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from speedster import optimize_model, save_model, load_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"76248033\",\n      \"metadata\": {\n        \"id\": \"76248033\"\n      },\n      \"source\": [\n        \"Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as an example and the optimization time mode. 
Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"zPC_EDwEJIM0\",\n      \"metadata\": {\n        \"id\": \"zPC_EDwEJIM0\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"dynamic_info = {\\n\",\n        \"    \\\"inputs\\\": [\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"        {0: 'batch', 1: 'num_tokens'},\\n\",\n        \"    ],\\n\",\n        \"    \\\"outputs\\\": [\\n\",\n        \"        {0: \\\"batch\\\", 1: \\\"num_tokens\\\"},\\n\",\n        \"        {0: \\\"batch\\\"}\\n\",\n        \"    ]\\n\",\n        \"}\\n\",\n        \"\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    ignore_compilers=[\\\"tvm\\\"],\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"98c6ab09\",\n      \"metadata\": {\n        \"id\": \"98c6ab09\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\\n\",\n        \"\\n\",\n        \"encoded_inputs = [tokenizer(text, return_tensors=\\\"tf\\\") for text in texts]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"6e5b3b21\",\n      \"metadata\": {\n        \"id\": \"6e5b3b21\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the original model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"d3bc5c98\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n   
     },\n        \"id\": \"d3bc5c98\",\n        \"outputId\": \"e0596cf2-fa96-4c50-c012-f5cdab82e681\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original DistilBERT: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"12c2df98\",\n      \"metadata\": {\n        \"id\": \"12c2df98\"\n      },\n      \"source\": [\n        \"Let's see the output of the original model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"4892a905\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"4892a905\",\n        \"outputId\": \"68d9b65f-e2cc-4998-8047-c9091f977698\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"3db0a7a1\",\n      \"metadata\": {\n        \"id\": \"3db0a7a1\"\n      },\n      \"source\": [\n        \"Let's run the prediction 100 times to calculate the average response time of the optimized model.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"a3e83997\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"a3e83997\",\n        \"outputId\": 
\"7a416b14-f170-4df9-d416-026f06a7d980\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    final_out = optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized BERT (no metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"0d884d61\",\n      \"metadata\": {\n        \"id\": \"0d884d61\"\n      },\n      \"source\": [\n        \"Let's see the output of the optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"75611b2e\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"75611b2e\",\n        \"outputId\": \"035d5c6d-fd7a-4506-af09-befcf9dd3b2d\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Speed up inference with Speedster: metric drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"7b1950d5\",\n      \"metadata\": {\n        \"id\": \"7b1950d5\"\n      },\n      \"source\": [\n        \"This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup.\"\n      ]\n    
},\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"de5721d8\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"de5721d8\",\n        \"outputId\": \"c9efff21-f963-47ff-e83d-a44615f90a10\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=encoded_inputs,\\n\",\n        \"    optimization_time=\\\"constrained\\\",\\n\",\n        \"    dynamic_info=dynamic_info,\\n\",\n        \"    ignore_compilers=[\\\"tvm\\\"],\\n\",\n        \"    metric_drop_ths=0.1,\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"0fbfe6fa\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"0fbfe6fa\",\n        \"outputId\": \"ada293f5-9b54-4186-8e48-74b994d4b797\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    final_out = model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    final_out = model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"original_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for original BERT: {original_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f89b7e6d\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"f89b7e6d\",\n        \"outputId\": 
\"51e497e1-a533-432d-d68e-b373f0ef69cb\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model(**encoded_input)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"10d17b5c\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"10d17b5c\",\n        \"outputId\": \"d5dc0acd-77e7-4054-b455-19343ff37951\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"times = []\\n\",\n        \"\\n\",\n        \"# Warmup for 30 iterations\\n\",\n        \"for encoded_input in encoded_inputs[:30]:\\n\",\n        \"    final_out = optimized_model(**encoded_input)\\n\",\n        \"\\n\",\n        \"# Benchmark\\n\",\n        \"for encoded_input in encoded_inputs:\\n\",\n        \"    st = time.time()\\n\",\n        \"    final_out = optimized_model(**encoded_input)\\n\",\n        \"    times.append(time.time()-st)\\n\",\n        \"optimized_model_time = sum(times)/len(times)*1000\\n\",\n        \"print(f\\\"Average response time for optimized BERT (metric drop): {optimized_model_time} ms\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"6bf3d1fb\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"6bf3d1fb\",\n        \"outputId\": \"6163d8ba-254f-47d2-a468-a921622a15ba\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model(**encoded_input)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": 
[\n        \"We can easily save the optimized model to disk with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load the model again:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"cb234e5e\",\n      \"metadata\": {\n        \"id\": \"cb234e5e\"\n      },\n      \"source\": [\n        \"Great! Was it easy? How are the results? Do you have any comments?\\n\",\n        \"Share your optimization results and thoughts with <a href=\\\"https://discord.gg/RbeQMu886J\\\" target=\\\"_blank\\\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\\n\",\n        \"\\n\",\n        \"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. 
Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\\n\",\n        \"\\n\",\n        \"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord.\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"premium\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.9.15 ('nebullvm_new')\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      
\"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:52:10) \\n[Clang 14.0.6 ]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"4fbc45cd27f7d363500c2e8640d9fdb717da4e1d8e4954a68e42b53d65ee27af\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/huggingface/Readme.md",
    "content": "# **Hugging Face Optimization**\n\nThis section contains all the available notebooks that show how to leverage Speedster to optimize Hugging Face models.\n\nHugging Face hosts models that can use either PyTorch or TensorFlow as backend. Both the backends are supported by Speedster.\n\n## Notebooks:\n| Notebook                                                                                                                                                                                                 | Description                                                                                      |                                                                                                                                                                                                                                                                |\n|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [Accelerate Hugging Face PyTorch GPT2](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb)             | Show how to optimize with Speedster the GPT2 model from Hugging Face with PyTorch backend.       
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb)       |\n| [Accelerate Hugging Face PyTorch BERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb)             | Show how to optimize with Speedster the BERT model from Hugging Face with PyTorch backend.       | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb)       |\n| [Accelerate Hugging Face PyTorch DistilBERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb) | Show how to optimize with Speedster the DistilBERT model from Hugging Face with PyTorch backend. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb) |\n| [Accelerate Hugging Face TensorFlow BERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb)       | Show how to optimize with Speedster the BERT model from Hugging Face with TensorFlow backend.    
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb)    |\n| [Accelerate Hugging Face PyTorch T5](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb)                 | Show how to optimize with Speedster the T5 model from Hugging Face with PyTorch backend.         | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb)         |\n\n## Hugging Face API quick view:\n\n``` python\nfrom speedster import optimize_model\nfrom transformers import AlbertModel, AlbertTokenizer\n\n# Load Albert as example\nmodel = AlbertModel.from_pretrained(\"albert-base-v1\")\ntokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n\n# Case 1: dictionary input format\ntext = \"This is an example text for the huggingface model.\"\ninput_dict = tokenizer(text, return_tensors=\"pt\")  # set return_tensors=\"tf\" or \"np\" for tensorflow models\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n  model, input_data=[input_dict]\n)\n\n## Warmup the model\n## This step is necessary before the latency computation of the \n## optimized model in order to get reliable results.\n# for _ in range(10):\n#   optimized_model(**input_dict)\n\n# Try the optimized model\nres = optimized_model(**input_dict)\n\n# # Case 2: strings input format\n# input_data = [\n#     \"This is a test.\",\n#     \"Hi my name is John.\",\n#     \"The cat is on the table.\",\n# ]\n# tokenizer_args = dict(\n#     return_tensors=\"pt\",  # set return_tensors=\"tf\" or \"np\" for tensorflow models\n#     
padding=\"longest\",\n#     truncation=True,\n# )\n# \n# # Run Speedster optimization\n# optimized_model = optimize_model(\n#   model, input_data=input_data, tokenizer=tokenizer, tokenizer_args=tokenizer_args\n# )\n```\n"
  },
  {
    "path": "optimization/speedster/notebooks/huggingface/faster_transformer_bert.py",
    "content": "# %%\nimport logging\nimport random\nimport time\n\nimport speedster\nimport torch\nfrom speedster import optimize_model\n\n# %%\nfrom nebullvm.operations.optimizations.compilers.faster_transformer.bert import (  # noqa: E501\n    detect_and_swap_bert_model,\n)\n\n# %%\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    get_faster_transformer_repo_path,\n)\nfrom transformers import BertTokenizer\nfrom transformers.models.bert.modeling_bert import (\n    BertForSequenceClassification as HFBertForSequenceClassification,\n)\n\n# %%\nprint(speedster.__file__)\nlib_path = str(\n    get_faster_transformer_repo_path()\n    / \"build\"\n    / \"lib\"\n    / \"libth_transformer.so\"\n)\n\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n# %%\n# https://huggingface.co/bert-base-cased-finetuned-mrpc\n\n\n# %%\ndef prepare_examples(tokenizer, len_dataset=1000):\n    sentences = [\n        \"Mars is the fourth planet from the Sun.\",\n        \"has a crust primarily composed of elements\",\n        \"However, it is unknown\",\n        \"can be viewed from Earth\",\n        \"It was the Romans\",\n    ]\n    texts = []\n    for _ in range(len_dataset):\n        n_times = random.randint(1, 30)\n        texts.append(\n            \" \".join(random.choice(sentences) for _ in range(n_times))\n        )\n    encoded_inputs = [\n        tokenizer(text, return_tensors=\"pt\", truncation=True).to(device)\n        for text in texts\n    ]\n    len(encoded_inputs), encoded_inputs[0].keys()\n    fake_input_id = torch.LongTensor(per_gpu_eval_batch_size, max_seq_length)\n    fake_input_id.fill_(1)\n    fake_input_id = fake_input_id.to(device)\n    fake_mask = torch.ones(per_gpu_eval_batch_size, max_seq_length).to(device)\n    fake_type_id = fake_input_id.clone().detach()\n    if data_type == \"fp16\":\n        fake_mask = fake_mask.half()\n    elif data_type == \"bf16\":\n        fake_mask = fake_mask.bfloat16()\n    return 
encoded_inputs, fake_input_id, fake_mask, fake_type_id\n\n\n# %%\nlogger = logging.getLogger(__name__)\nuse_ths = use_torchscript = False\nremove_padding = False\ndata_type = \"fp16\"  # \"fp32\", \"fp16\", \"bf16\"\n\nper_gpu_eval_batch_size = 1\nmax_seq_length = 128\nmodel_name_or_path = \"bert-base-cased-finetuned-mrpc\"\n\n\nmodel = HFBertForSequenceClassification.from_pretrained(\n    model_name_or_path, torchscript=True\n)\nmodel.eval().to(device)\ntokenizer = BertTokenizer.from_pretrained(model_name_or_path)\nencoded_inputs, fake_input_id, fake_mask, fake_type_id = prepare_examples(\n    tokenizer\n)\n\n\ndef optimize_no_trace(model, data_type=\"fp16\"):\n    model = detect_and_swap_bert_model(\n        model, data_type=\"fp16\", lib_path=lib_path, remove_padding=False\n    )\n    if data_type == \"fp16\":\n        logger.info(\"Use fp16\")\n        model.half()\n    elif data_type == \"bf16\":\n        logger.info(\"Use bf16\")\n        model.bfloat16()\n    return model.to(device)\n\n\ndef optimize_with_trace(\n    model, data_type, per_gpu_eval_batch_size, max_seq_length\n):\n    model = optimize_no_trace(model, data_type)\n    logger.info(\"Use TorchScript mode\")\n    fake_input_id = torch.LongTensor(per_gpu_eval_batch_size, max_seq_length)\n    fake_input_id.fill_(1)\n    fake_input_id = fake_input_id.to(device)\n    fake_mask = torch.ones(per_gpu_eval_batch_size, max_seq_length).to(device)\n    fake_type_id = fake_input_id.clone().detach()\n    if data_type == \"fp16\":\n        fake_mask = fake_mask.half()\n    elif data_type == \"bf16\":\n        fake_mask = fake_mask.bfloat16()\n    model.eval()\n    with torch.no_grad():\n        model_ = torch.jit.trace(\n            model, (fake_input_id, fake_mask, fake_type_id)\n        )\n    return model_\n\n\ndef benchmark(model, model_desc=\"original BERT\"):\n    times = []\n\n    # Warmup for 30 iterations\n    for encoded_input in encoded_inputs[:30]:\n        with torch.no_grad():\n            _ = 
model(**encoded_input)\n\n    # Benchmark\n    for encoded_input in encoded_inputs:\n        st = time.perf_counter()\n        with torch.no_grad():\n            _ = model(**encoded_input)\n        times.append(time.perf_counter() - st)\n    original_model_time = sum(times) / len(times) * 1000\n    print(f\"Average response time for {model_desc}: {original_model_time} ms\")\n\n\nprint(f\"{encoded_inputs[0].keys()}\")\n\n\nbenchmark(model, \"BERT\")\nbenchmark(model, \"BERT\")\ndata_type = \"fp16\"  # \"fp32\", \"fp16\", \"bf16\nper_gpu_eval_batch_size = 1\nmax_seq_length = 128\nfaster_model = optimize_no_trace(model, data_type)\nbenchmark(faster_model, \"faster BERT (no metric drop)\")\n# Average response time for BERT: 4.741025467636064 ms\n# Average response time for BERT: 4.686204055091366 ms\n\nfastest_model = optimize_with_trace(\n    model, data_type, per_gpu_eval_batch_size, max_seq_length\n)\n\nbenchmark(fastest_model, \"fastest BERT (no metric drop)\")\n# Average response time for faster BERT (no metric drop): 1.5583459960762411 ms # noqa: E501\n\n\n# the above operations modifies `model` in-place\n# so we need reload a fresh one to test speedster\nmodel = HFBertForSequenceClassification.from_pretrained(\n    model_name_or_path, torchscript=True\n)\n# Average response time for fastest BERT (no metric drop): 1.4657320715487003 ms # noqa: E501\n\nmodel.eval().to(device)\ndynamic_info = {\n    \"inputs\": [\n        {0: \"batch\", 1: \"num_tokens\"},\n        {0: \"batch\", 1: \"num_tokens\"},\n        {0: \"batch\", 1: \"num_tokens\"},\n    ],\n    \"outputs\": [{0: \"batch\", 1: \"num_tokens\"}],\n}\nspeedster_optimized_model = optimize_model(\n    model=model,\n    input_data=encoded_inputs,\n    optimization_time=\"constrained\",\n    # force it to use fastertransformer\n    ignore_compilers=[\"tensor_rt\", \"tvm\", \"onnxruntime\", \"torchscript\"],\n    dynamic_info=dynamic_info,\n)\n\n\nbenchmark(\n    speedster_optimized_model, \"speedster optimized 
BERT (no metric drop)\"\n)\nbenchmark(\n    speedster_optimized_model, \"speedster optimized BERT (no metric drop)\"\n)\n# Average response time for speedster optimized BERT (no metric drop): 14.040142675396055 ms # noqa: E501\n# Average response time for speedster optimized BERT (no metric drop): 3.4986357542220503 ms # noqa: E501\nspeedster_optimized_model_fp16 = optimize_model(\n    model=model,\n    input_data=encoded_inputs,\n    optimization_time=\"constrained\",\n    # force it to use fastertransformer\n    ignore_compilers=[\"tensor_rt\", \"tvm\", \"onnxruntime\", \"torchscript\"],\n    dynamic_info=dynamic_info,\n    metric_drop_ths=0.1,\n)\n\n\nbenchmark(\n    speedster_optimized_model_fp16, \"speedster optimized BERT (metric drop)\"\n)\nbenchmark(\n    speedster_optimized_model_fp16, \"speedster optimized BERT (metric drop)\"\n)\n# Average response time for speedster optimized BERT (no metric drop): 14.040142675396055 ms # noqa: E501\n# Average response time for speedster optimized BERT (no metric drop): 3.4986357542220503 ms # noqa: E501\n"
  },
  {
    "path": "optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"p5b0PzpW1xJq\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {},\n      \"source\": [\n        \"# Accelerate ONNX ResNet50 with Speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"T9xuwZEHzN2K\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\\n\",\n        \"\\n\",\n        \"We will\\n\",\n        \"1. Install Speedster and the deep learning compilers used by the library.\\n\",\n        \"2. Speed up an ONNX ResNet50 without any loss of accuracy.\\n\",\n        \"3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. 
pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"5Yc5KYo_YzE8\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"HbFy2Aykz2Qo\"\n      },\n      \"source\": [\n        \"# Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks onnx --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"N5RXHoZl0p3p\"\n      },\n      \"source\": [\n        \"# Optimization example with ONNX\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"-Ju-VcRH01Mw\"\n      },\n      \"source\": [\n        \"In the following example we will try to optimize a standard ONNX 
resnet50.\n\",\n        \"\\n\",\n        \"Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://nebuly.gitbook.io/nebuly/nebullvm/get-started).\\n\",\n        \"\\n\",\n        \"Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate it under the constraint of losing up to 2% of accuracy (metric = \\\"accuracy\\\", metric_drop_ths = 0.02).\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"skxEuemn171G\"\n      },\n      \"source\": [\n        \"## Scenario 1 - No accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"wVRLXrDi2VaG\"\n      },\n      \"source\": [\n        \"First of all we download the pretrained ONNX resnet50 model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"6I5GDvWbZ-LJ\",\n        \"outputId\": \"6ac09b39-9c6e-4d38-dfb6-35069938f9c1\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"vrkOvGfkaXk7\"\n      },\n      \"source\": [\n        \"Then we optimize it with Speedster simple API\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": 
\"2RbgGruAeQcf\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import numpy as np\\n\",\n        \"from speedster import optimize_model, save_model, load_model\\n\",\n        \"\\n\",\n        \"# Load a resnet as example\\n\",\n        \"model = \\\"resnet50-v1-12.onnx\\\"\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model    \\n\",\n        \"input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0]))]\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\"\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = np.random.randn(1, 3, 224, 224).astype(np.float32)\\n\",\n        \"res_optimized = optimized_model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"i2IKNc2jbax8\"\n      },\n      \"source\": [\n        \"We can print the type of the optimized model to see which compiler was faster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"dFhqAhr0bcbZ\",\n        \"outputId\": \"aa0b2fe9-2fa0-405b-8e44-3ebbf70f0e69\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"_UuiqkEfcPy4\"\n      },\n      \"source\": [\n        \"In our case, the optimized model type was NumpyONNXInferenceLearner, so this means that onnxruntime was the faster compiler.\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"E4759DQJcc15\"\n      },\n      \"source\": [\n        \"After the optimization 
step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"ktQaNfGqceOD\"\n      },\n      \"source\": [\n        \"First of all, let's compute and print the original model result\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"gUMlNAZrcj5-\",\n        \"outputId\": \"3670f41f-b2db-4b55-dbf7-c9b0a0146c9d\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import onnx\\n\",\n        \"import onnxruntime as ort\\n\",\n        \"from typing import Dict, List\\n\",\n        \"\\n\",\n        \"\\n\",\n        \"def get_input_names(onnx_model: str):\\n\",\n        \"    model = onnx.load(onnx_model)\\n\",\n        \"    input_all = [node.name for node in model.graph.input]\\n\",\n        \"    return input_all\\n\",\n        \"\\n\",\n        \"\\n\",\n        \"def get_output_names(onnx_model: str):\\n\",\n        \"    model = onnx.load(onnx_model)\\n\",\n        \"    output_all = [node.name for node in model.graph.output]\\n\",\n        \"    return output_all\\n\",\n        \"\\n\",\n        \"\\n\",\n        \"def run_onnx_model(\\n\",\n        \"    onnx_model: str, session: ort.InferenceSession, input_tensors: List[np.ndarray], inputs: Dict, output_names: str\\n\",\n        \") -> List[np.ndarray]:\\n\",\n        \"    \\n\",\n        \"    res = session.run(\\n\",\n        \"        output_names=output_names, input_feed=inputs\\n\",\n        \"    )\\n\",\n        \"    return list(res)\\n\",\n        \"\\n\",\n        \"\\n\",\n        \"session = ort.InferenceSession(\\n\",\n        \"    model,\\n\",\n        \"    providers=[\\\"CUDAExecutionProvider\\\", 
\\\"CPUExecutionProvider\\\"] # Change to [\\\"CPUExecutionProvider\\\"] if run on cpu\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"inputs = {\\n\",\n        \"    name: array\\n\",\n        \"    for name, array in zip(get_input_names(model), [x])\\n\",\n        \"}\\n\",\n        \"\\n\",\n        \"res_original = run_onnx_model(model, session, [x], inputs, get_output_names(model))\\n\",\n        \"res_original\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"iU3dPwSTfWr_\"\n      },\n      \"source\": [\n        \"Then, let's print the optimized model result that we computed before\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"S1EKoJ75fVAh\",\n        \"outputId\": \"73e7b127-e7d3-44a9-bd78-65961bd051df\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"res_optimized\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"Lj4crPMmf_LX\"\n      },\n      \"source\": [\n        \"Then, let's compute the average latency of the baseline model:\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"rGNKr_ShgBbu\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"I2G4OzhCgG_D\",\n        \"outputId\": \"a23eb4ea-fa0f-4221-a177-20876e452b53\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"num_iters = 100\\n\",\n        \"\\n\",\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  
run_onnx_model(model, session, [x], inputs, get_output_names(model))\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  run_onnx_model(model, session, [x], inputs, get_output_names(model))\\n\",\n        \"stop = time.time()\\n\",\n        \"\\n\",\n        \"print(\\\"Average latency original model: {:.4f} seconds\\\".format((stop - start) / num_iters))\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"f-jmRjJvgW5V\"\n      },\n      \"source\": [\n        \"Finally we compute the average latency for the optimized model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"51c3uaMcgaR-\",\n        \"outputId\": \"1319a7bc-df1d-4f19-9426-3940ab4a7c5e\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  optimized_model(x)\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  optimized_model(x)\\n\",\n        \"stop = time.time()\\n\",\n        \"\\n\",\n        \"print(\\\"Average latency optimized model: {:.4f} seconds\\\".format((stop - start) / num_iters))\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"tBeRKNTI3iyK\"\n      },\n      \"source\": [\n        \"## Scenario 2 - Accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"w3wutIzfAMe_\"\n      },\n      \"source\": [\n        \"In this scenario, we set a max threshold for the accuracy drop to 2%\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"fO1nGqpj3p7z\"\n      },\n      
\"outputs\": [],\n      \"source\": [\n        \"import numpy as np\\n\",\n        \"from speedster import optimize_model\\n\",\n        \"\\n\",\n        \"# Load a resnet as example\\n\",\n        \"model = \\\"resnet50-v1-12.onnx\\\"\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model\\n\",\n        \"# Note that in this case we should provide the model at least 100 data samples\\n\",\n        \"input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for i in range(100)]\\n\",\n        \"\\n\",\n        \"# Run nebullvm optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\", metric = \\\"accuracy\\\", metric_drop_ths = 0.02\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = np.random.randn(1, 3, 224, 224).astype(np.float32)\\n\",\n        \"res_optimized = optimized_model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"4UFtwZbEiLv3\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the baseline model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"qFKHaHM6-GKm\",\n        \"outputId\": \"73b95996-4d1f-4aa7-a96d-a40070bf36bd\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"num_iters = 100\\n\",\n        \"\\n\",\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  run_onnx_model(model, session, [x], inputs, get_output_names(model))\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  run_onnx_model(model, session, [x], inputs, get_output_names(model))\\n\",\n        \"stop = time.time()\\n\",\n      
  \"\\n\",\n        \"print(\\\"Average latency original model: {:.4f} seconds\\\".format((stop - start) / num_iters))\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"J8g0aJRJiXA5\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the optimized model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"_IbAW0KA4Fm5\",\n        \"outputId\": \"67f44401-9568-4f38-802a-d81e3139af5a\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  optimized_model(x)\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  optimized_model(x)\\n\",\n        \"stop = time.time()\\n\",\n        \"\\n\",\n        \"print(\\\"Average latency optimized model: {:.4f} seconds\\\".format((stop - start) / num_iters))\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": 
{},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"standard\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.8.10 64-bit\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      
\"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.8.9 (default, Apr 13 2022, 08:48:06) \\n[Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/onnx/Readme.md",
    "content": "# **ONNX Optimization**\n\nThis section contains all the available notebooks that show how to leverage Speedster to optimize ONNX models.\n\n## Notebooks:\n| Notebook                                                                                                                                                      | Description                                                          |                                                                                                                                                                                                                                       |\n|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [Accelerate ONNX Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model in ONNX format. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) |\n\n\n## ONNX API quick view:\n\n```python\nimport numpy as np\nfrom speedster import optimize_model\n\n# Load a resnet as example\n# Model was downloaded from here: \n# https://github.com/onnx/models/tree/main/vision/classification/resnet\nmodel = \"resnet50-v1-12.onnx\"\n\n# Provide an input data for the model    \ninput_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0]))]\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n  model, input_data=input_data, optimization_time=\"unconstrained\"\n)\n\n# Try the optimized model\nx = np.random.randn(1, 3, 224, 224).astype(np.float32)\n\n## Warmup the model\n## This step is necessary before the latency computation of the \n## optimized model in order to get reliable results.\n# for _ in range(10):\n#   optimized_model(x)\n\nres_optimized = optimized_model(x)\n```\n"
  },
  {
    "path": "optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"p5b0PzpW1xJq\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {},\n      \"source\": [\n        \"# Accelerate PyTorch ResNet50 with Speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"T9xuwZEHzN2K\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using Speedster app from the open-source library `nebullvm`.\\n\",\n        \"\\n\",\n        \"We will\\n\",\n        \"1. Install Speedster and the deep learning compilers used by the library.\\n\",\n        \"2. Speed up a PyTorch ResNet50 without any loss of accuracy.\\n\",\n        \"3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. 
pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"_0ZRCXCR9693\",\n        \"outputId\": \"19096862-5c5c-4f9f-b2ad-3ce084ccf213\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"HbFy2Aykz2Qo\"\n      },\n      \"source\": [\n        \"### Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"ZPJHVZ74d8r2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"b0CLgQqxyrQi\"\n      },\n      \"source\": [\n        \"Let's now install the deep learning compilers used by Speedster that are not yet installed on the hardware.\\n\",\n        \"\\n\",\n        \"The installation of the compilers may take a few minutes.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"GvK9mZSjeLU5\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"N5RXHoZl0p3p\"\n      },\n      \"source\": [\n        \"## Optimization example with Pytorch\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"-Ju-VcRH01Mw\"\n      },\n      \"source\": [\n        \"In the following example we 
will try to optimize a standard resnet50 loaded directly from torchvision.\\n\",\n        \"\\n\",\n        \"Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\\n\",\n        \"\\n\",\n        \"Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate it under the constraint of losing up to 2% of accuracy (metric = \\\"accuracy\\\", metric_drop_ths = 0.02).\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"skxEuemn171G\"\n      },\n      \"source\": [\n        \"### Scenario 1 - No accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"wVRLXrDi2VaG\"\n      },\n      \"source\": [\n        \"First we load the model and optimize it using the Speedster API:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"2RbgGruAeQcf\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"import torchvision.models as models\\n\",\n        \"from speedster import optimize_model, save_model, load_model\\n\",\n        \"\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"\\n\",\n        \"# Load a resnet as example\\n\",\n        \"model = models.resnet50().to(device)\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model    \\n\",\n        \"input_data = [((torch.randn(1, 3, 256, 256), ), 
torch.tensor([0]))]\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\"\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = torch.randn(1, 3, 256, 256).to(device)\\n\",\n        \"model.eval()\\n\",\n        \"res_optimized = optimized_model(x)\\n\",\n        \"res_original = model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"JMiuufyu2gD3\"\n      },\n      \"source\": [\n        \"We can print the type of the optimized model to see which compiler was faster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"ifuLyQsM9697\",\n        \"outputId\": \"c1534e0d-e5bb-4d44-91e9-652593751d52\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"4WxcxrUC9698\"\n      },\n      \"source\": [\n        \"In our case, the optimized model type was PytorchTensorRTInferenceLearner, so this means that Pytorch-TensorRT was the faster compiler.\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"iwHKfT349698\"\n      },\n      \"source\": [\n        \"After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"-IMJpfcb9698\"\n      },\n      \"source\": [\n        \"First of all, let's print the results\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n    
  \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"uI8Kd1Z49698\",\n        \"outputId\": \"832d3053-d6c8-4cc2-9b48-a59dfaa45d33\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"res_original\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"0I_zSpv29698\",\n        \"outputId\": \"a0ba566d-6730-4954-8dd0-eb47b549cbf1\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"res_optimized\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"hBEtrYOd9699\"\n      },\n      \"source\": [\n        \"Then, let's compare the performances:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"GqxiCAbpfcwV\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from nebullvm.tools.benchmark import benchmark\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"_0b0Bzwq-czD\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Set the model to eval mode and move it to the available device\\n\",\n        \"\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"\\n\",\n        \"model.eval()\\n\",\n        \"model.to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"UqxzStjD2v0r\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the baseline model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": 
{\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"dkt67_Orwlv4\",\n        \"outputId\": \"fc10c03c-c3ad-44d4-9fd6-c9b6dc0256c7\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(model, input_data)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"AgOv-GqQ3KIC\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the optimized model:\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"4PodpaDVfwzT\",\n        \"outputId\": \"27a42560-93a2-4c19-e68d-360093fe914c\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(optimized_model, input_data)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"tBeRKNTI3iyK\"\n      },\n      \"source\": [\n        \"## Scenario 2 - Accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"w3wutIzfAMe_\"\n      },\n      \"source\": [\n        \"In this scenario, we set a max threshold for the accuracy drop to 2%\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"fO1nGqpj3p7z\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"import torchvision.models as models\\n\",\n        \"from speedster import optimize_model\\n\",\n        \"\\n\",\n        \"# Load a resnet as example\\n\",\n        \"model = models.resnet50().to(device)\\n\",\n        \"\\n\",\n        \"# Provide 100 random input data for the model  \\n\",\n        \"input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\\n\",\n        \"\\n\",\n  
      \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\", metric=\\\"accuracy\\\", metric_drop_ths=0.02\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = torch.randn(1, 3, 256, 256).to(device)\\n\",\n        \"res = optimized_model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"qFKHaHM6-GKm\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Set the model to eval mode and move it to the available device\\n\",\n        \"\\n\",\n        \"model.eval()\\n\",\n        \"model.to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"yfW9kmHX-pGi\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the baseline model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"0MMrL3959hli\",\n        \"outputId\": \"2e8d27ec-a9f3-4f70-8c75-a0df974f2653\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(model, input_data)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"i3GqasOM-u8f\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the optimized model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"_IbAW0KA4Fm5\",\n        \"outputId\": \"48d83c89-5687-42aa-a3b8-6989bcb66aa6\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        
\"benchmark(optimized_model, input_data)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a 
href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"standard\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.8.10 64-bit\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.8.9 (default, Apr 13 2022, 08:48:06) \\n[Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"p5b0PzpW1xJq\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"metadata\": {},\n      \"source\": [\n        \"# Accelerate PyTorch VisionTransformer with Speedster\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"T9xuwZEHzN2K\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using Speedster app from the open-source library `nebullvm`.\\n\",\n        \"\\n\",\n        \"We will\\n\",\n        \"1. Install Speedster and the deep learning compilers used by the library.\\n\",\n        \"2. Speed up a PyTorch ViT without any loss of accuracy.\\n\",\n        \"3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. 
pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"_0ZRCXCR9693\",\n        \"outputId\": \"19096862-5c5c-4f9f-b2ad-3ce084ccf213\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"HbFy2Aykz2Qo\"\n      },\n      \"source\": [\n        \"### Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"ZPJHVZ74d8r2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"b0CLgQqxyrQi\"\n      },\n      \"source\": [\n        \"Let's now install the deep learning compilers used by Speedster that are not yet installed on the hardware.\\n\",\n        \"\\n\",\n        \"The installation of the compilers may take a few minutes.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"GvK9mZSjeLU5\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"N5RXHoZl0p3p\"\n      },\n      \"source\": [\n        \"## Optimization example with Pytorch\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"-Ju-VcRH01Mw\"\n      },\n      \"source\": [\n        
\"In the following example we will try to optimize a ViT model loaded directly from the vit_pytorch library.\\n\",\n        \"\\n\",\n        \"Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\\n\",\n        \"\\n\",\n        \"Let's first test the optimization without any loss in accuracy (metric_drop_ths=0, which is the default value), and then attempt to further accelerate it while constraining the loss of accuracy to a maximum of 2% (metric = 'accuracy', metric_drop_ths = 0.02).\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"skxEuemn171G\"\n      },\n      \"source\": [\n        \"### Scenario 1 - No accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"wVRLXrDi2VaG\"\n      },\n      \"source\": [\n        \"First we load the model and optimize it using the Speedster API:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"2RbgGruAeQcf\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"from vit_pytorch import ViT\\n\",\n        \"from speedster import optimize_model, save_model, load_model\\n\",\n        \"\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"\\n\",\n        \"# Load a ViT model\\n\",\n        \"model = ViT(\\n\",\n        \"    image_size = 256,\\n\",\n        \"    patch_size = 32,\\n\",\n        \"    num_classes = 1000,\\n\",\n      
  \"    dim = 1024,\\n\",\n        \"    depth = 6,\\n\",\n        \"    heads = 16,\\n\",\n        \"    mlp_dim = 2048,\\n\",\n        \"    dropout = 0.1,\\n\",\n        \"    emb_dropout = 0.1\\n\",\n        \").to(device)\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model    \\n\",\n        \"input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\"\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = torch.randn(1, 3, 256, 256).to(device)\\n\",\n        \"model.to(device).eval()\\n\",\n        \"res_optimized = optimized_model(x)\\n\",\n        \"res_original = model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"JMiuufyu2gD3\"\n      },\n      \"source\": [\n        \"We can print the type of the optimized model to see which compiler was faster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"ifuLyQsM9697\",\n        \"outputId\": \"c1534e0d-e5bb-4d44-91e9-652593751d52\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"4WxcxrUC9698\"\n      },\n      \"source\": [\n        \"In our case, the optimized model type was TorchScriptInferenceLearner, so this means that TorchScriptCompiler was the faster compiler.\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"iwHKfT349698\"\n      },\n      \"source\": [\n        \"After the optimization 
step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"-IMJpfcb9698\"\n      },\n      \"source\": [\n        \"First of all, let's print the results\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"uI8Kd1Z49698\",\n        \"outputId\": \"832d3053-d6c8-4cc2-9b48-a59dfaa45d33\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"res_original\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"0I_zSpv29698\",\n        \"outputId\": \"a0ba566d-6730-4954-8dd0-eb47b549cbf1\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"res_optimized\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"hBEtrYOd9699\"\n      },\n      \"source\": [\n        \"Then, let's compare the performances:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": 3,\n      \"metadata\": {\n        \"id\": \"GqxiCAbpfcwV\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from nebullvm.tools.benchmark import benchmark\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"_0b0Bzwq-czD\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Set the model to eval mode and move it to the available device\\n\",\n        \"\\n\",\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"\\n\",\n        
\"model.eval()\\n\",\n        \"model.to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"UqxzStjD2v0r\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the baseline model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"dkt67_Orwlv4\",\n        \"outputId\": \"fc10c03c-c3ad-44d4-9fd6-c9b6dc0256c7\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(model, input_data)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"AgOv-GqQ3KIC\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the optimized model:\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"4PodpaDVfwzT\",\n        \"outputId\": \"27a42560-93a2-4c19-e68d-360093fe914c\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(optimized_model, input_data)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"tBeRKNTI3iyK\"\n      },\n      \"source\": [\n        \"## Scenario 2 - Accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"w3wutIzfAMe_\"\n      },\n      \"source\": [\n        \"In this scenario, we set a max threshold for the accuracy drop to 2%\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"fO1nGqpj3p7z\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"import 
torchvision.models as models\\n\",\n        \"from speedster import optimize_model\\n\",\n        \"\\n\",\n        \"# Load a ViT model\\n\",\n        \"model = ViT(\\n\",\n        \"    image_size = 256,\\n\",\n        \"    patch_size = 32,\\n\",\n        \"    num_classes = 1000,\\n\",\n        \"    dim = 1024,\\n\",\n        \"    depth = 6,\\n\",\n        \"    heads = 16,\\n\",\n        \"    mlp_dim = 2048,\\n\",\n        \"    dropout = 0.1,\\n\",\n        \"    emb_dropout = 0.1\\n\",\n        \").to(device)\\n\",\n        \"\\n\",\n        \"# Provide 100 random input data for the model  \\n\",\n        \"input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\", metric=\\\"accuracy\\\", metric_drop_ths=0.02\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = torch.randn(1, 3, 256, 256).to(device)\\n\",\n        \"res = optimized_model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"qFKHaHM6-GKm\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Set the model to eval mode and move it to the available device\\n\",\n        \"\\n\",\n        \"model.eval()\\n\",\n        \"model.to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"yfW9kmHX-pGi\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the baseline model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"0MMrL3959hli\",\n        \"outputId\": 
\"2e8d27ec-a9f3-4f70-8c75-a0df974f2653\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(model, input_data)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"i3GqasOM-u8f\"\n      },\n      \"source\": [\n        \"Here we compute the average throughput for the optimized model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"_IbAW0KA4Fm5\",\n        \"outputId\": \"48d83c89-5687-42aa-a3b8-6989bcb66aa6\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(optimized_model, input_data)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": 13,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": 14,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n   
 },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"standard\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.8.10 64-bit\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.8.16\"\n    },\n    \"vscode\": {\n      
\"interpreter\": {\n        \"hash\": \"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c977e4a\",\n      \"metadata\": {\n        \"id\": \"3c977e4a\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"6240f0ea\",\n      \"metadata\": {\n        \"id\": \"6240f0ea\"\n      },\n      \"source\": [\n        \"# Accelerate PyTorch YOLOv5 with Speedster\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"6cfcd562\",\n      \"metadata\": {\n        \"id\": \"6cfcd562\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\\n\",\n        \"\\n\",\n        \"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"38171e92\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"okgu97ThVwnH\",\n      \"metadata\": {\n        \"id\": \"okgu97ThVwnH\"\n      },\n      \"source\": [\n        \"### Install Speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"e62f5afa\",\n      \"metadata\": {\n        \"id\": \"e62f5afa\"\n      },\n      \"source\": [\n        \"### Install and test YOLO\"\n      ]\n    },\n    {\n      \"cell_type\": 
\"markdown\",\n      \"id\": \"b38d727d\",\n      \"metadata\": {\n        \"id\": \"b38d727d\"\n      },\n      \"source\": [\n        \"Let's install YOLO.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f48f6a35\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"f48f6a35\",\n        \"outputId\": \"5b06307a-9196-4e5e-a542-1254d6c94ce2\",\n        \"scrolled\": true\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"! pip install -r https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"92f49833\",\n      \"metadata\": {\n        \"id\": \"92f49833\"\n      },\n      \"source\": [\n        \"We start by downloading the model from the Torch hub.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"2dc46f67\",\n      \"metadata\": {\n        \"id\": \"2dc46f67\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import copy\\n\",\n        \"import time\\n\",\n        \"import types\\n\",\n        \"\\n\",\n        \"import torch\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"ead6637d\",\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\",\n          \"height\": 248,\n          \"referenced_widgets\": [\n            \"7f41159d22fe4ce7b8e7789a92478242\",\n            \"2ecf6a6cfad64af698a88479ba95005b\",\n            \"e7a2646ac0cd4afba67823799147ce13\",\n            \"fd77306783b84b489b90d072a44a27d8\",\n            \"94a4bc5454074b5c900186a60a950d19\",\n            \"682cafb37aa34c75961d61d2665a50b7\",\n            \"5e71284dc02f4346b217732643c90b86\",\n            \"881f619ee75547a49c6d48fd3140721c\",\n      
      \"56a1b99b282a4a63a64f48347963a5ab\",\n            \"a59557bb103e4a3b96062c60d539db35\",\n            \"65786546f69b420b9ec8451c97338f30\"\n          ]\n        },\n        \"id\": \"ead6637d\",\n        \"outputId\": \"8d44d380-535d-446c-fcb0-bb55ba9e9f84\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Load Model\\n\",\n        \"model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, force_reload=True)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"KcteQ5tsWy1v\",\n      \"metadata\": {\n        \"id\": \"KcteQ5tsWy1v\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\\n\",\n        \"model.to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"37d07ab0\",\n      \"metadata\": {\n        \"id\": \"37d07ab0\"\n      },\n      \"source\": [\n        \"## Optimization with Speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"332cbc38\",\n      \"metadata\": {\n        \"id\": \"332cbc38\"\n      },\n      \"source\": [\n        \"Now we are ready for optimizing the body of YOLOv5 using the `Speedster` function `optimize_model`.\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d1fc4d01\",\n      \"metadata\": {\n        \"id\": \"d1fc4d01\"\n      },\n      \"source\": [\n        \"Speedster was built to be very easy to use. To optimize a model, you only need to specify the model, the batch size and input size for each input tensor, and a directory in which to save the optimized model. 
In the example, we chose the same directory in which this notebook runs.\\n\",\n        \"\\n\",\n        \"With the latest API, there are two ways to use Speedster:\\n\",\n        \"\\n\",\n        \"- Option A: Accelerate the model up to ~10 times without any loss in performance (accuracy/precision/etc.)\\n\",\n        \"- Option B: Accelerate the model up to ~30 times with a pre-defined maximum loss in performance\\n\",\n        \"    \\n\",\n        \"To learn more about how to use Speedster, check out the <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#-speedster\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> readme on GitHub </a>.\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb07403\",\n      \"metadata\": {\n        \"id\": \"ceb07403\"\n      },\n      \"source\": [\n        \"In this example, we provide the code to run option B.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"74f9f650\",\n      \"metadata\": {\n        \"id\": \"74f9f650\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from speedster import optimize_model, save_model, load_model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b729ccce\",\n      \"metadata\": {},\n      \"source\": [\n        \"Let's load some example data to feed the optimize_model function\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"20c15b09\",\n      \"metadata\": {\n        \"id\": \"20c15b09\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from PIL import Image\\n\",\n        \"import requests\\n\",\n        \"import numpy as np\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"8fcf6332\",\n      \"metadata\": {\n        \"id\": \"8fcf6332\"\n      },\n  
    \"outputs\": [],\n      \"source\": [\n        \"img_name = \\\"zidane.png\\\"\\n\",\n        \"imgs = ['https://ultralytics.com/images/zidane.jpg']  # batch of images\\n\",\n        \"Image.open(requests.get(imgs[0], stream=True).raw).save(img_name)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"178a31f1\",\n      \"metadata\": {\n        \"id\": \"178a31f1\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"def read_and_crop(im, original_model, img_size):\\n\",\n        \"    p  =  next(original_model.parameters())\\n\",\n        \"    im = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im)\\n\",\n        \"    max_y, max_x = im.size\\n\",\n        \"    ptr_x = np.random.choice(max_x-img_size[0])\\n\",\n        \"    ptr_y = np.random.choice(max_y-img_size[1])\\n\",\n        \"    im = np.array(im.crop((ptr_y, ptr_x, ptr_y + img_size[1], ptr_x + img_size[0])))\\n\",\n        \"    x = np.expand_dims(im, axis=0)\\n\",\n        \"    x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2)))  # stack and BHWC to BCHW\\n\",\n        \"    x = torch.from_numpy(x).to(p.device).type_as(p) / 255  # uint8 to fp16/32\\n\",\n        \"    return x\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"51757959\",\n      \"metadata\": {\n        \"id\": \"51757959\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"input_data = [((read_and_crop(img_name, model, (640, 640)),), None) for _ in range(100)]\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c01adfeb\",\n      \"metadata\": {\n        \"id\": \"c01adfeb\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"model_optimized = optimize_model(\\n\",\n        \"    model=model,\\n\",\n        \"    input_data=input_data,\\n\",\n        \"    
optimization_time=\\\"unconstrained\\\",\\n\",\n        \"    metric_drop_ths=0.05\\n\",\n        \")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"495c1642\",\n      \"metadata\": {},\n      \"source\": [\n        \"Let's compare the original model performance with the optimized one:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"82e39d5b\",\n      \"metadata\": {\n        \"id\": \"82e39d5b\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from nebullvm.tools.benchmark import benchmark\\n\",\n        \"\\n\",\n        \"original_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, force_reload=True)\\n\",\n        \"print(\\\"Benchmark original model\\\")\\n\",\n        \"benchmark(original_model, input_data)\\n\",\n        \"\\n\",\n        \"print(\\\"Benchmark optimized model\\\")\\n\",\n        \"benchmark(model_optimized, input_data)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"f0d6d006\",\n      \"metadata\": {},\n      \"source\": [\n        \"Let's ensure that the output of the original model is the same as the optimized model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"66c0dbab\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"input_tensor = torch.randn(1, 3, 640, 640).to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"bfe573fd\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"model(input_tensor)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"89654058\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"model_optimized(input_tensor)\"\n      ]\n  
  },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b72bdf54\",\n      \"metadata\": {},\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ada71f91\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"99b3a9d0\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(model_optimized, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"6308ddd7\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"f9946f6b\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"model_optimized = load_model(\\\"model_save_path\\\")\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"d50807de\",\n      \"metadata\": {\n        \"id\": \"d50807de\"\n      },\n      \"source\": [\n        \"What an amazing result, right?!? 
Stay tuned for more cool content from the Nebuly team :) \"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"collapsed_sections\": [],\n      \"provenance\": []\n    },\n    \"gpuClass\": \"standard\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n     
 \"version\": \"3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e\"\n      }\n    },\n    \"widgets\": {\n      \"application/vnd.jupyter.widget-state+json\": {\n        \"2ecf6a6cfad64af698a88479ba95005b\": {\n          \"model_module\": \"@jupyter-widgets/controls\",\n          \"model_module_version\": \"1.5.0\",\n          \"model_name\": \"HTMLModel\",\n          \"state\": {\n            \"_dom_classes\": [],\n            \"_model_module\": \"@jupyter-widgets/controls\",\n            \"_model_module_version\": \"1.5.0\",\n            \"_model_name\": \"HTMLModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/controls\",\n            \"_view_module_version\": \"1.5.0\",\n            \"_view_name\": \"HTMLView\",\n            \"description\": \"\",\n            \"description_tooltip\": null,\n            \"layout\": \"IPY_MODEL_682cafb37aa34c75961d61d2665a50b7\",\n            \"placeholder\": \"​\",\n            \"style\": \"IPY_MODEL_5e71284dc02f4346b217732643c90b86\",\n            \"value\": \"100%\"\n          }\n        },\n        \"56a1b99b282a4a63a64f48347963a5ab\": {\n          \"model_module\": \"@jupyter-widgets/controls\",\n          \"model_module_version\": \"1.5.0\",\n          \"model_name\": \"ProgressStyleModel\",\n          \"state\": {\n            \"_model_module\": \"@jupyter-widgets/controls\",\n            \"_model_module_version\": \"1.5.0\",\n            \"_model_name\": \"ProgressStyleModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/base\",\n            \"_view_module_version\": \"1.2.0\",\n            \"_view_name\": \"StyleView\",\n            \"bar_color\": null,\n            \"description_width\": \"\"\n          }\n        },\n        \"5e71284dc02f4346b217732643c90b86\": {\n        
  \"model_module\": \"@jupyter-widgets/controls\",\n          \"model_module_version\": \"1.5.0\",\n          \"model_name\": \"DescriptionStyleModel\",\n          \"state\": {\n            \"_model_module\": \"@jupyter-widgets/controls\",\n            \"_model_module_version\": \"1.5.0\",\n            \"_model_name\": \"DescriptionStyleModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/base\",\n            \"_view_module_version\": \"1.2.0\",\n            \"_view_name\": \"StyleView\",\n            \"description_width\": \"\"\n          }\n        },\n        \"65786546f69b420b9ec8451c97338f30\": {\n          \"model_module\": \"@jupyter-widgets/controls\",\n          \"model_module_version\": \"1.5.0\",\n          \"model_name\": \"DescriptionStyleModel\",\n          \"state\": {\n            \"_model_module\": \"@jupyter-widgets/controls\",\n            \"_model_module_version\": \"1.5.0\",\n            \"_model_name\": \"DescriptionStyleModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/base\",\n            \"_view_module_version\": \"1.2.0\",\n            \"_view_name\": \"StyleView\",\n            \"description_width\": \"\"\n          }\n        },\n        \"682cafb37aa34c75961d61d2665a50b7\": {\n          \"model_module\": \"@jupyter-widgets/base\",\n          \"model_module_version\": \"1.2.0\",\n          \"model_name\": \"LayoutModel\",\n          \"state\": {\n            \"_model_module\": \"@jupyter-widgets/base\",\n            \"_model_module_version\": \"1.2.0\",\n            \"_model_name\": \"LayoutModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/base\",\n            \"_view_module_version\": \"1.2.0\",\n            \"_view_name\": \"LayoutView\",\n            \"align_content\": null,\n            \"align_items\": null,\n            \"align_self\": null,\n            \"border\": null,\n            \"bottom\": 
null,\n            \"display\": null,\n            \"flex\": null,\n            \"flex_flow\": null,\n            \"grid_area\": null,\n            \"grid_auto_columns\": null,\n            \"grid_auto_flow\": null,\n            \"grid_auto_rows\": null,\n            \"grid_column\": null,\n            \"grid_gap\": null,\n            \"grid_row\": null,\n            \"grid_template_areas\": null,\n            \"grid_template_columns\": null,\n            \"grid_template_rows\": null,\n            \"height\": null,\n            \"justify_content\": null,\n            \"justify_items\": null,\n            \"left\": null,\n            \"margin\": null,\n            \"max_height\": null,\n            \"max_width\": null,\n            \"min_height\": null,\n            \"min_width\": null,\n            \"object_fit\": null,\n            \"object_position\": null,\n            \"order\": null,\n            \"overflow\": null,\n            \"overflow_x\": null,\n            \"overflow_y\": null,\n            \"padding\": null,\n            \"right\": null,\n            \"top\": null,\n            \"visibility\": null,\n            \"width\": null\n          }\n        },\n        \"7f41159d22fe4ce7b8e7789a92478242\": {\n          \"model_module\": \"@jupyter-widgets/controls\",\n          \"model_module_version\": \"1.5.0\",\n          \"model_name\": \"HBoxModel\",\n          \"state\": {\n            \"_dom_classes\": [],\n            \"_model_module\": \"@jupyter-widgets/controls\",\n            \"_model_module_version\": \"1.5.0\",\n            \"_model_name\": \"HBoxModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/controls\",\n            \"_view_module_version\": \"1.5.0\",\n            \"_view_name\": \"HBoxView\",\n            \"box_style\": \"\",\n            \"children\": [\n              \"IPY_MODEL_2ecf6a6cfad64af698a88479ba95005b\",\n              \"IPY_MODEL_e7a2646ac0cd4afba67823799147ce13\",\n              
\"IPY_MODEL_fd77306783b84b489b90d072a44a27d8\"\n            ],\n            \"layout\": \"IPY_MODEL_94a4bc5454074b5c900186a60a950d19\"\n          }\n        },\n        \"881f619ee75547a49c6d48fd3140721c\": {\n          \"model_module\": \"@jupyter-widgets/base\",\n          \"model_module_version\": \"1.2.0\",\n          \"model_name\": \"LayoutModel\",\n          \"state\": {\n            \"_model_module\": \"@jupyter-widgets/base\",\n            \"_model_module_version\": \"1.2.0\",\n            \"_model_name\": \"LayoutModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/base\",\n            \"_view_module_version\": \"1.2.0\",\n            \"_view_name\": \"LayoutView\",\n            \"align_content\": null,\n            \"align_items\": null,\n            \"align_self\": null,\n            \"border\": null,\n            \"bottom\": null,\n            \"display\": null,\n            \"flex\": null,\n            \"flex_flow\": null,\n            \"grid_area\": null,\n            \"grid_auto_columns\": null,\n            \"grid_auto_flow\": null,\n            \"grid_auto_rows\": null,\n            \"grid_column\": null,\n            \"grid_gap\": null,\n            \"grid_row\": null,\n            \"grid_template_areas\": null,\n            \"grid_template_columns\": null,\n            \"grid_template_rows\": null,\n            \"height\": null,\n            \"justify_content\": null,\n            \"justify_items\": null,\n            \"left\": null,\n            \"margin\": null,\n            \"max_height\": null,\n            \"max_width\": null,\n            \"min_height\": null,\n            \"min_width\": null,\n            \"object_fit\": null,\n            \"object_position\": null,\n            \"order\": null,\n            \"overflow\": null,\n            \"overflow_x\": null,\n            \"overflow_y\": null,\n            \"padding\": null,\n            \"right\": null,\n            \"top\": null,\n            
\"visibility\": null,\n            \"width\": null\n          }\n        },\n        \"94a4bc5454074b5c900186a60a950d19\": {\n          \"model_module\": \"@jupyter-widgets/base\",\n          \"model_module_version\": \"1.2.0\",\n          \"model_name\": \"LayoutModel\",\n          \"state\": {\n            \"_model_module\": \"@jupyter-widgets/base\",\n            \"_model_module_version\": \"1.2.0\",\n            \"_model_name\": \"LayoutModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/base\",\n            \"_view_module_version\": \"1.2.0\",\n            \"_view_name\": \"LayoutView\",\n            \"align_content\": null,\n            \"align_items\": null,\n            \"align_self\": null,\n            \"border\": null,\n            \"bottom\": null,\n            \"display\": null,\n            \"flex\": null,\n            \"flex_flow\": null,\n            \"grid_area\": null,\n            \"grid_auto_columns\": null,\n            \"grid_auto_flow\": null,\n            \"grid_auto_rows\": null,\n            \"grid_column\": null,\n            \"grid_gap\": null,\n            \"grid_row\": null,\n            \"grid_template_areas\": null,\n            \"grid_template_columns\": null,\n            \"grid_template_rows\": null,\n            \"height\": null,\n            \"justify_content\": null,\n            \"justify_items\": null,\n            \"left\": null,\n            \"margin\": null,\n            \"max_height\": null,\n            \"max_width\": null,\n            \"min_height\": null,\n            \"min_width\": null,\n            \"object_fit\": null,\n            \"object_position\": null,\n            \"order\": null,\n            \"overflow\": null,\n            \"overflow_x\": null,\n            \"overflow_y\": null,\n            \"padding\": null,\n            \"right\": null,\n            \"top\": null,\n            \"visibility\": null,\n            \"width\": null\n          }\n        },\n       
 \"a59557bb103e4a3b96062c60d539db35\": {\n          \"model_module\": \"@jupyter-widgets/base\",\n          \"model_module_version\": \"1.2.0\",\n          \"model_name\": \"LayoutModel\",\n          \"state\": {\n            \"_model_module\": \"@jupyter-widgets/base\",\n            \"_model_module_version\": \"1.2.0\",\n            \"_model_name\": \"LayoutModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/base\",\n            \"_view_module_version\": \"1.2.0\",\n            \"_view_name\": \"LayoutView\",\n            \"align_content\": null,\n            \"align_items\": null,\n            \"align_self\": null,\n            \"border\": null,\n            \"bottom\": null,\n            \"display\": null,\n            \"flex\": null,\n            \"flex_flow\": null,\n            \"grid_area\": null,\n            \"grid_auto_columns\": null,\n            \"grid_auto_flow\": null,\n            \"grid_auto_rows\": null,\n            \"grid_column\": null,\n            \"grid_gap\": null,\n            \"grid_row\": null,\n            \"grid_template_areas\": null,\n            \"grid_template_columns\": null,\n            \"grid_template_rows\": null,\n            \"height\": null,\n            \"justify_content\": null,\n            \"justify_items\": null,\n            \"left\": null,\n            \"margin\": null,\n            \"max_height\": null,\n            \"max_width\": null,\n            \"min_height\": null,\n            \"min_width\": null,\n            \"object_fit\": null,\n            \"object_position\": null,\n            \"order\": null,\n            \"overflow\": null,\n            \"overflow_x\": null,\n            \"overflow_y\": null,\n            \"padding\": null,\n            \"right\": null,\n            \"top\": null,\n            \"visibility\": null,\n            \"width\": null\n          }\n        },\n        \"e7a2646ac0cd4afba67823799147ce13\": {\n          \"model_module\": 
\"@jupyter-widgets/controls\",\n          \"model_module_version\": \"1.5.0\",\n          \"model_name\": \"FloatProgressModel\",\n          \"state\": {\n            \"_dom_classes\": [],\n            \"_model_module\": \"@jupyter-widgets/controls\",\n            \"_model_module_version\": \"1.5.0\",\n            \"_model_name\": \"FloatProgressModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/controls\",\n            \"_view_module_version\": \"1.5.0\",\n            \"_view_name\": \"ProgressView\",\n            \"bar_style\": \"success\",\n            \"description\": \"\",\n            \"description_tooltip\": null,\n            \"layout\": \"IPY_MODEL_881f619ee75547a49c6d48fd3140721c\",\n            \"max\": 14808437,\n            \"min\": 0,\n            \"orientation\": \"horizontal\",\n            \"style\": \"IPY_MODEL_56a1b99b282a4a63a64f48347963a5ab\",\n            \"value\": 14808437\n          }\n        },\n        \"fd77306783b84b489b90d072a44a27d8\": {\n          \"model_module\": \"@jupyter-widgets/controls\",\n          \"model_module_version\": \"1.5.0\",\n          \"model_name\": \"HTMLModel\",\n          \"state\": {\n            \"_dom_classes\": [],\n            \"_model_module\": \"@jupyter-widgets/controls\",\n            \"_model_module_version\": \"1.5.0\",\n            \"_model_name\": \"HTMLModel\",\n            \"_view_count\": null,\n            \"_view_module\": \"@jupyter-widgets/controls\",\n            \"_view_module_version\": \"1.5.0\",\n            \"_view_name\": \"HTMLView\",\n            \"description\": \"\",\n            \"description_tooltip\": null,\n            \"layout\": \"IPY_MODEL_a59557bb103e4a3b96062c60d539db35\",\n            \"placeholder\": \"​\",\n            \"style\": \"IPY_MODEL_65786546f69b420b9ec8451c97338f30\",\n            \"value\": \" 14.1M/14.1M [00:00&lt;00:00, 24.5MB/s]\"\n          }\n        }\n      }\n    }\n  },\n  \"nbformat\": 4,\n  
\"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![New Release: Accelerate YOLOv8](assets/yolov8.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Accelerate Ultralytics YOLOv8 with Speedster\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"6cfcd562\",\n   \"metadata\": {\n    \"id\": \"6cfcd562\"\n   },\n   \"source\": [\n    \"Hi and welcome 👋\\n\",\n    \"\\n\",\n    \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster module from the open-source library nebullvm.\\n\",\n    \"\\n\",\n    \"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). 
To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\\n\",\n    \"\\n\",\n    \"Let's jump to the code.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%env CUDA_VISIBLE_DEVICES=0\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Setup\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Install Speedster\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install speedster\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Install Ultralytics YOLOv8\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install ultralytics\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Load YOLOv8s\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import torch\\n\",\n    \"from ultralytics import YOLO\\n\",\n    \"\\n\",\n    \"yolo = YOLO('yolov8s.pt')\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Let's load some dummy test data and see the original output\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   
\"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_data = torch.randn(1, 3, 640, 640)\\n\",\n    \"yolo.model(test_data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The original YOLOv8 model returns a tuple whose first element is a tensor and whose second element is a list of tensors. Speedster currently supports only models whose outputs are all tensors, so we need to create a wrapper to overcome this issue:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"class YOLOWrapper(torch.nn.Module):\\n\",\n    \"    def __init__(self, yolo_model):\\n\",\n    \"        super().__init__()\\n\",\n    \"        self.model = yolo_model.model\\n\",\n    \"    \\n\",\n    \"    def forward(self, x, *args, **kwargs):\\n\",\n    \"        res = self.model(x)\\n\",\n    \"        return res[0], *tuple(res[1])\\n\",\n    \"        \\n\",\n    \"model_wrapper = YOLOWrapper(yolo)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## YOLOv8s Optimization with GPU\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can now optimize the model using Speedster:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from speedster import optimize_model\\n\",\n    \"\\n\",\n    \"# Provide some input data for the model    \\n\",\n    \"input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\\n\",\n    \"\\n\",\n    \"# Run Speedster optimization\\n\",\n    \"optimized_model = optimize_model(\\n\",\n    \"  model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can finally 
restore the original output format by wrapping the optimized model in a new class:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"class OptimizedYOLO(torch.nn.Module):\\n\",\n    \"    def __init__(self, optimized_model):\\n\",\n    \"        super().__init__()\\n\",\n    \"        self.model = optimized_model\\n\",\n    \"    \\n\",\n    \"    def forward(self, x, *args, **kwargs):\\n\",\n    \"        res = self.model(x)\\n\",\n    \"        return res[0], list(res[1:])\\n\",\n    \"    \\n\",\n    \"optimized_wrapper = OptimizedYOLO(optimized_model)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"optimized_wrapper(test_data.cuda())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## YOLOv8s Optimization with CPU\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from speedster import optimize_model, save_model, load_model\\n\",\n    \"from ultralytics import YOLO\\n\",\n    \"\\n\",\n    \"yolo = YOLO('yolov8s.pt')\\n\",\n    \"model_wrapper = YOLOWrapper(yolo)\\n\",\n    \"\\n\",\n    \"# Provide some input data for the model    \\n\",\n    \"input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\\n\",\n    \"\\n\",\n    \"# Run Speedster optimization\\n\",\n    \"optimized_model = optimize_model(\\n\",\n    \"  model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True, device=\\\"cpu\\\"\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"optimized_wrapper = OptimizedYOLO(optimized_model)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"optimized_wrapper(test_data)\"\n   ]\n  },\n  
{\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"b72bdf54\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Save and reload the optimized model\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"ada71f91\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can easily save to disk the optimized model with the following line:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"99b3a9d0\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"save_model(optimized_model, \\\"model_save_path\\\")\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"6308ddd7\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can then load again the model:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"f9946f6b\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"optimized_model = load_model(\\\"model_save_path\\\")\\n\",\n    \"optimized_wrapper = OptimizedYOLO(optimized_model)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"d50807de\",\n   \"metadata\": {\n    \"id\": \"d50807de\"\n   },\n   \"source\": [\n    \"What an amazing result, right?!? 
Stay tuned for more cool content from the Nebuly team :) \"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"b77ff2ac\",\n   \"metadata\": {\n    \"id\": \"b77ff2ac\"\n   },\n   \"source\": [\n    \"<center> \\n\",\n    \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n    \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n    \"</center>\\n\",\n    \"\\n\",\n    \"<center> \\n\",\n    \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n    \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n    \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n    \"</center>\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n  },\n  \"vscode\": {\n   \"interpreter\": {\n    \"hash\": \"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e\"\n   }\n  }\n },\n \"nbformat\": 4,\n 
\"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"wQS9kNoyjsKe\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {},\n      \"source\": [\n        \"# Accelerate Fast AI ResNet34 with Speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"hBObeC3SmRwl\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the open-source library nebullvm.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"87jOeOOtktQy\"\n      },\n      \"source\": [\n        \"### Fine-tune a fast.ai model\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"XlVUVGOAlS6O\"\n      },\n      \"source\": [\n        \"For the tutorial, we will use a fast.ai notebook for beginners in which we will classify whether the input image contains a cat (True label) or a dog (False label). 
Let's jump to the code.\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"9cFt-FEvlNkG\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from fastai.vision.all import *\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"GqdMEBPZlmpu\",\n        \"outputId\": \"18d8a166-9b5d-4c91-cbc7-c8591bd5c0d2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"path = untar_data(URLs.PETS)\\n\",\n        \"files = get_image_files(path/\\\"images\\\")\\n\",\n        \"\\n\",\n        \"def label_func(f): return f[0].isupper()\\n\",\n        \"\\n\",\n        \"dls = ImageDataLoaders.from_name_func(path, files, label_func, item_tfms=Resize(224), num_workers=0)\\n\",\n        \"dls.show_batch()\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"VrmI4VeZlhJG\"\n      },\n      \"source\": [\n        \"After downloading a sample of images of dogs and cats, we fine-tune the fast.ai model.\\n\",\n        \"\\n\",\n        \"\\n\",\n        \"\\n\",\n        \"\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"MJ8q9xxBlv1x\",\n        \"outputId\": \"8169f902-3dd0-449c-c293-91fb7ab94003\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"learn = cnn_learner(dls, resnet34, metrics=error_rate)\\n\",\n        \"learn.fine_tune(1)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": 
\"RBzr8_47lxsW\",\n        \"outputId\": \"b87781d6-2826-4cc6-9fd3-57da5cdcbbd4\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"valid_loss, error = learn.validate()\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"WSWq0il6l0eC\"\n      },\n      \"source\": [\n        \"Now that we have fine-tuned the model, let's calculate the time required to run a prediction as an average over 100 tests.\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"o_iMOqI_l6-Y\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"JNZXAgIYl883\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%%capture\\n\",\n        \"times = []\\n\",\n        \"for _ in range(100):\\n\",\n        \"    st = time.time()\\n\",\n        \"    preds = learn.predict(files[0])\\n\",\n        \"    times.append((time.time()-st)*1000)\\n\",\n        \"fastai_vanilla_time = sum(times)/len(times)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"colab\": {\n          \"base_uri\": \"https://localhost:8080/\"\n        },\n        \"id\": \"N9IDkfyDmADn\",\n        \"outputId\": \"0113620d-4c77-4a9f-ae1e-e64b0cb32293\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"print(f\\\"Average prediction time: {fastai_vanilla_time} ms,\\\\nPrediction: {preds}\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"hlwl87jRmBy2\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"#learn.save(\\\".\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": 
\"bes-NoZnmhyy\"\n      },\n      \"source\": [\n        \"### Install nebullvm\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"48aljCHu14-H\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install nebullvm:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"QFQh3BVr1-GO\",\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"id\": \"8a7a86b3\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"cffbfa32\",\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {},\n      \"source\": [\n        \"### Data preparation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"zVfy0VBooG_J\"\n      },\n      \"source\": [\n        \"Now we prepare the dataset so that it can be processed by Speedster.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"RuUavpyooIBT\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"\\n\",\n        \"xs, ys = [], []\\n\",\n        \"for i, (x, y) in enumerate(dls.train):\\n\",\n        \"    if i >=100:\\n\",\n        \"        break\\n\",\n        \"    xs.append(x)\\n\",\n        \"    ys.append(y)\\n\",\n        \"xs = torch.cat(xs, dim=0)\\n\",\n        
\"ys = torch.cat(ys, dim=0)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"kkVzQVmgoMQh\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"dl_nebullvm = [((x.unsqueeze(dim=0),), y.unsqueeze(0)) for x, y in zip(xs, ys)]\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"_Eb_AAeqoOUS\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"original_model = learn.model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"0siBvWcsnv49\"\n      },\n      \"source\": [\n        \"### Unconstrained without accuracy loss (thus constrained)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"ToxCH47qstn9\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import torch\\n\",\n        \"import torchvision.models as models\\n\",\n        \"from speedster import optimize_model, save_model, load_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"njoWqCSzvzpr\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Load a resnet as example\\n\",\n        \"model = original_model\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model    \\n\",\n        \"input_data = dl_nebullvm\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\",\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"# x = torch.randn(1, 3, 224, 224)\\n\",\n        \"# res = optimized_model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      
\"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"GGRbJL6Xq6Ns\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"h75V23FSs2MZ\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"device = torch.device(\\\"cuda\\\" if torch.cuda.is_available() else \\\"cpu\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"# Set the model to eval mode and move it to the available device\\n\",\n        \"model.eval()\\n\",\n        \"model.to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"R_QrrT0oq1i_\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"res_optimized = optimized_model(x)\\n\",\n        \"res_optimized\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"xtjV8pDYxIIl\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"from nebullvm.tools.benchmark import benchmark\\n\",\n        \"\\n\",\n        \"benchmark(model, input_data)\\n\",\n        \"benchmark(optimized_model, input_data)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"lWJCMGGJxaG5\"\n      },\n      \"source\": [\n        \"### Unconstrained with 2% accuracy loss\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"g9Huil4-xeX5\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Load a resnet as example\\n\",\n        \"model = original_model\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model    \\n\",\n     
   \"input_data = dl_nebullvm\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\", metric_drop_ths=0.02, metric=\\\"accuracy\\\"\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"# x = torch.randn(1, 3, 224, 224)\\n\",\n        \"# res = optimized_model(x)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"cLxoOzxe4clI\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Set the model to eval mode and move it to the available device\\n\",\n        \"model.eval()\\n\",\n        \"model.to(device)\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"c3QvxwUD4clI\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"dRLd4QMJ4clI\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"benchmark(model, input_data)\\n\",\n        \"benchmark(optimized_model, input_data)\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"ceb60d8c\",\n      \"metadata\": {\n        \"id\": \"ceb60d8c\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"d9eda1a0\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"62b6fcbf\",\n      \"metadata\": {},\n      \"outputs\": 
[],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"3c968d51\",\n      \"metadata\": {},\n      \"source\": [\n        \"We can then load again the model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"id\": \"c1340c49\",\n      \"metadata\": {},\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      \"provenance\": []\n    },\n    \"gpuClass\": 
\"standard\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.8.10 64-bit\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.8.9 (default, Apr 13 2022, 08:48:06) \\n[Clang 13.1.6 (clang-1316.0.21.2.5)]\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/pytorch/Readme.md",
    "content": "# **PyTorch Optimization**\n\nThis section contains all the available notebooks that show how to leverage Speedster to optimize PyTorch models.\n\n## Notebooks:\n| Notebook                                                                                                                                                                   | Description                                                                   |                                                                                                                                                                                                                                             |\n|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [Accelerate Torchvision Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from Torchvision. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) |\n| [Accelerate Fast AI Resnet34](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb)     | Show how to optimize with Speedster a Resnet34 model loaded from Fast AI.     
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb) |\n| [Accelerate PyTorch ViT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb)               | Show how to optimize with Speedster a PyTorch ViT model.                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb)      |\n| [Accelerate Ultralytics YOLOv5](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb)     | Show how to optimize with Speedster a YOLOv5 model from Ultralytics.          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb)   |\n| [Accelerate Ultralytics YOLOv8](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb)     | Show how to optimize with Speedster a YOLOv8 model from Ultralytics.          
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb)   |\n\n\n## PyTorch API quick view:\n\n``` python\nimport torch\nimport torchvision.models as models\nfrom speedster import optimize_model\n\n# Load a resnet as example\nmodel = models.resnet50()\n\n# Provide an input data for the model    \ninput_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n  model, input_data=input_data, optimization_time=\"unconstrained\"\n)\n\n# Try the optimized model\nx = torch.randn(1, 3, 256, 256)\n\n## Warmup the model\n## This step is necessary before the latency computation of the \n## optimized model in order to get reliable results.\n# for _ in range(10):\n#   optimized_model(x)\n\nres = optimized_model(x)\n```\n"
  },
  {
    "path": "optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb",
    "content": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"p5b0PzpW1xJq\"\n      },\n      \"source\": [\n        \"![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"-KdJPm7M05Jc\"\n      },\n      \"source\": [\n        \"# Accelerate Tensorflow ResNet50 with Speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"T9xuwZEHzN2K\"\n      },\n      \"source\": [\n        \"Hi and welcome 👋\\n\",\n        \"\\n\",\n        \"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\\n\",\n        \"\\n\",\n        \"We will\\n\",\n        \"1. Install Speedster and the deep learning compilers used by the library.\\n\",\n        \"2. Speed up a Tensorflow ResNet50 without any loss of accuracy.\\n\",\n        \"3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. 
pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\\n\",\n        \"\\n\",\n        \"Let's jump to the code.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"KIeIvBPVLQuq\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"%env CUDA_VISIBLE_DEVICES=0\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"HbFy2Aykz2Qo\"\n      },\n      \"source\": [\n        \"### Installation\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"48aljCHu14-H\"\n      },\n      \"source\": [\n        \"Install Speedster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"QFQh3BVr1-GO\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!pip install speedster\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"8a7a86b3\"\n      },\n      \"source\": [\n        \"Install deep learning compilers:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"cffbfa32\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"!python -m nebullvm.installers.auto_installer --frameworks tensorflow --compilers all\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"N5RXHoZl0p3p\"\n      },\n      \"source\": [\n        \"## Optimization example with Tensorflow\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"-Ju-VcRH01Mw\"\n      },\n      \"source\": [\n        \"In the following example we will try to optimize a standard resnet50 loaded directly from keras.\\n\",\n        \"\\n\",\n        \"Speedster can accelerate neural networks 
without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\\n\",\n        \"\\n\",\n        \"Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate it under the constraint of losing up to 2% of accuracy (metric = \\\"accuracy\\\", metric_drop_ths = 0.02).\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"skxEuemn171G\"\n      },\n      \"source\": [\n        \"### Scenario 1 - No accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"wVRLXrDi2VaG\"\n      },\n      \"source\": [\n        \"First we load the model and optimize it using the Speedster API:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"2RbgGruAeQcf\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# If you encountered any error, run the cell again\\n\",\n        \"import tensorflow as tf\\n\",\n        \"from tensorflow.keras.applications.resnet50 import ResNet50\\n\",\n        \"from speedster import optimize_model, save_model, load_model\\n\",\n        \"\\n\",\n        \"# Load a resnet as example\\n\",\n        \"model = ResNet50()\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model    \\n\",\n        \"input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0]))]\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, 
optimization_time=\\\"unconstrained\\\"\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = tf.random.normal([1, 224, 224, 3])\\n\",\n        \"res_original = model.predict(x)\\n\",\n        \"res_optimized = optimized_model.predict(x)[0]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"NGrk6_jwRubP\"\n      },\n      \"source\": [\n        \"We can print the type of the optimized model to see which compiler was faster:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"cVMn6erJLQuu\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"aT0BhdIKR7gY\"\n      },\n      \"source\": [\n        \"In our case, the optimized model type was TensorflowNvidiaInferenceLearner, so this means that Tensor RT was the faster compiler.\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"JMiuufyu2gD3\"\n      },\n      \"source\": [\n        \"After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"Swpr-Wi5Si9a\"\n      },\n      \"source\": [\n        \"First of all, let's print the results\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"MjGtKkeZSOc7\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"res_original\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"dhe94Tk3SSfn\"\n      },\n      \"outputs\": [],\n      
\"source\": [\n        \"res_optimized\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"UqxzStjD2v0r\"\n      },\n      \"source\": [\n        \"Then, let's compute the average latency of the baseline model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"ELyTjg6_S4Us\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import time\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"dkt67_Orwlv4\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"num_iters = 100\\n\",\n        \"\\n\",\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  model.predict(x)\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  model.predict(x)\\n\",\n        \"stop = time.time()\\n\",\n        \"\\n\",\n        \"print(\\\"Average latency original model: {:.4f} seconds\\\".format((stop - start) / num_iters))\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"AgOv-GqQ3KIC\"\n      },\n      \"source\": [\n        \"Finally we compute the average latency for the optimized model:\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"4PodpaDVfwzT\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  optimized_model.predict(x)\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  optimized_model.predict(x)\\n\",\n        \"stop = time.time()\\n\",\n        \"\\n\",\n        \"print(\\\"Average latency optimized model: {:.4f} seconds\\\".format((stop - start) / 
num_iters))\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"tBeRKNTI3iyK\"\n      },\n      \"source\": [\n        \"### Scenario 2 - Accuracy drop\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"w3wutIzfAMe_\"\n      },\n      \"source\": [\n        \"In this scenario, we set a max threshold for the accuracy drop to 2%\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"fO1nGqpj3p7z\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"import tensorflow as tf\\n\",\n        \"from tensorflow.keras.applications.resnet50 import ResNet50\\n\",\n        \"from speedster import optimize_model\\n\",\n        \"\\n\",\n        \"# Load a resnet as example\\n\",\n        \"model = ResNet50()\\n\",\n        \"\\n\",\n        \"# Provide an input data for the model   \\n\",\n        \"# Note that in this case we should provide the model at least 100 data samples\\n\",\n        \"input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for i in range(100)]\\n\",\n        \"\\n\",\n        \"# Run Speedster optimization\\n\",\n        \"optimized_model = optimize_model(\\n\",\n        \"  model, input_data=input_data, optimization_time=\\\"unconstrained\\\", metric = \\\"accuracy\\\", metric_drop_ths = 0.02\\n\",\n        \")\\n\",\n        \"\\n\",\n        \"# Try the optimized model\\n\",\n        \"x = tf.random.normal([1, 224, 224, 3])\\n\",\n        \"res_original = model.predict(x)\\n\",\n        \"res_optimized = optimized_model.predict(x)[0]\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"yfW9kmHX-pGi\"\n      },\n      \"source\": [\n        \"Here we compute the average latency for the baseline model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": 
null,\n      \"metadata\": {\n        \"id\": \"0MMrL3959hli\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"num_iters = 100\\n\",\n        \"\\n\",\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  model.predict(x)\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  model.predict(x)\\n\",\n        \"stop = time.time()\\n\",\n        \"\\n\",\n        \"print(\\\"Average latency original model: {:.4f} seconds\\\".format((stop - start) / num_iters))\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"i3GqasOM-u8f\"\n      },\n      \"source\": [\n        \"Here we compute the average latency for the optimized model:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"_IbAW0KA4Fm5\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"# Warmup\\n\",\n        \"for i in range(10):\\n\",\n        \"  optimized_model.predict(x)\\n\",\n        \"\\n\",\n        \"start = time.time()\\n\",\n        \"for i in range(num_iters):\\n\",\n        \"  optimized_model.predict(x)\\n\",\n        \"stop = time.time()\\n\",\n        \"\\n\",\n        \"print(\\\"Average latency optimized model: {:.4f} seconds\\\".format((stop - start) / num_iters))\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"4XFMC1S6zXTU\"\n      },\n      \"source\": [\n        \"## Save and reload the optimized model\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"OXHVr3EAzbT5\"\n      },\n      \"source\": [\n        \"We can easily save to disk the optimized model with the following line:\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"3M565P-zzaFB\"\n      
},\n      \"outputs\": [],\n      \"source\": [\n        \"save_model(optimized_model, \\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"ee8CS_Evzg1j\"\n      },\n      \"source\": [\n        \"We can then load again the model:\\n\",\n        \"\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"execution_count\": null,\n      \"metadata\": {\n        \"id\": \"zOQ88SY_zg-A\"\n      },\n      \"outputs\": [],\n      \"source\": [\n        \"optimized_model = load_model(\\\"model_save_path\\\")\"\n      ]\n    },\n    {\n      \"attachments\": {},\n      \"cell_type\": \"markdown\",\n      \"id\": \"b77ff2ac\",\n      \"metadata\": {\n        \"id\": \"b77ff2ac\"\n      },\n      \"source\": [\n        \"<center> \\n\",\n        \"    <a href=\\\"https://discord.com/invite/RbeQMu886J\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Join the community </a> |\\n\",\n        \"    <a href=\\\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Contribute to the library </a>\\n\",\n        \"</center>\\n\",\n        \"\\n\",\n        \"<center> \\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> How speedster works </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Documentation </a> •\\n\",\n        \"    <a href=\\\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\\\" target=\\\"_blank\\\" style=\\\"text-decoration: none;\\\"> Quick start </a> \\n\",\n        \"</center>\"\n      ]\n    }\n  ],\n  \"metadata\": {\n    \"accelerator\": \"GPU\",\n    \"colab\": {\n      
\"provenance\": []\n    },\n    \"gpuClass\": \"standard\",\n    \"kernelspec\": {\n      \"display_name\": \"Python 3.8.10 64-bit\",\n      \"language\": \"python\",\n      \"name\": \"python3\"\n    },\n    \"language_info\": {\n      \"codemirror_mode\": {\n        \"name\": \"ipython\",\n        \"version\": 3\n      },\n      \"file_extension\": \".py\",\n      \"mimetype\": \"text/x-python\",\n      \"name\": \"python\",\n      \"nbconvert_exporter\": \"python\",\n      \"pygments_lexer\": \"ipython3\",\n      \"version\": \"3.8.10\"\n    },\n    \"vscode\": {\n      \"interpreter\": {\n        \"hash\": \"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6\"\n      }\n    }\n  },\n  \"nbformat\": 4,\n  \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "optimization/speedster/notebooks/tensorflow/Readme.md",
    "content": "# **Tensorflow Optimization**\n\nThis section contains all the available notebooks that show how to leverage Speedster to optimize Tensorflow models.\n\n## Notebooks:\n| Notebook                                                                                                                                                                   | Description                                                             |                                                                                                                                                                                                                                                   |\n|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| [Accelerate Keras Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from keras. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb) |\n\n\n## Tensorflow API quick view:\n\n``` python\nimport tensorflow as tf\nfrom tensorflow.keras.applications.resnet50 import ResNet50\nfrom speedster import optimize_model\n\n# Load a resnet as example\nmodel = ResNet50()\n\n# Provide an input data for the model    \ninput_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0]))]\n\n# Run Speedster optimization\noptimized_model = optimize_model(\n  model, input_data=input_data, optimization_time=\"unconstrained\"\n)\n\n# Try the optimized model\nx = tf.random.normal([1, 224, 224, 3])\nres_original = model.predict(x)\n\n## Warmup the model\n## This step is necessary before the latency computation of the \n## optimized model in order to get reliable results.\n# for _ in range(10):\n#   optimized_model.predict(x)\n\nres_optimized = optimized_model.predict(x)[0]\n```\n"
  },
  {
    "path": "optimization/speedster/requirements.txt",
    "content": "nebullvm>=0.10.0\ntabulate>=0.8.0\n"
  },
  {
    "path": "optimization/speedster/setup.py",
    "content": "from pathlib import Path\nfrom setuptools import setup, find_packages\n\n\nREQUIREMENTS = [\n    \"nebullvm>=0.9.0\",\n    \"tabulate>=0.8.0\",\n]\n\nthis_directory = Path(__file__).parent\nlong_description = (this_directory / \"README.md\").read_text(encoding=\"utf8\")\n\nsetup(\n    name=\"speedster\",\n    version=\"0.4.0\",\n    packages=find_packages(),\n    install_requires=REQUIREMENTS,\n    long_description=long_description,\n    include_package_data=True,\n    long_description_content_type=\"text/markdown\",\n)\n"
  },
  {
    "path": "optimization/speedster/speedster/__init__.py",
    "content": "from speedster.api.functions import optimize_model  # noqa: F401\nfrom nebullvm.operations.inference_learners.utils import (  # noqa: F401\n    load_model,\n    save_model,\n)\n"
  },
  {
    "path": "optimization/speedster/speedster/api/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/speedster/speedster/api/functions.py",
    "content": "import logging\nfrom typing import (\n    Union,\n    Iterable,\n    Sequence,\n    Callable,\n    Dict,\n    List,\n    Optional,\n)\n\nfrom nebullvm.config import DEFAULT_METRIC_DROP_THS\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom nebullvm.tools.logger import debug_mode_enabled, LoggingContext\n\nfrom speedster.root_op import SpeedsterRootOp\n\nfrom nebullvm.tools.utils import check_device\n\n\ndef optimize_model(\n    model: Union[torch.nn.Module, tf.Module, str],\n    input_data: Union[Iterable, Sequence],\n    metric_drop_ths: float = DEFAULT_METRIC_DROP_THS,\n    metric: Union[str, Callable] = None,\n    optimization_time: str = \"constrained\",\n    dynamic_info: Dict = None,\n    config_file: str = None,\n    ignore_compilers: List[str] = None,\n    ignore_compressors: List[str] = None,\n    store_latencies: bool = False,\n    device: Optional[str] = None,\n    **kwargs,\n):\n    \"\"\"Optimize the input model regardless of the framework it was used for\n    implementing it. The optimized model given as output will share with the\n    input one the same API, i.e. the optimized model will have the same\n    interface as the original one.\n\n    Args:\n        model (Union[torch.Module, tf.Module, str]): The input model. It can be\n            a torch or tensorflow model or a path to an onnx saved model.\n        input_data (Iterable or Sequence): Input data to be used for\n            optimizing the model. Note that if 'unconstrained' is selected as\n            `optimization_time`, it would be beneficial to provide at least 100\n            data samples in order to use all the techniques supported by\n            Nebullvm. The data can be given in either as sequence (data can be\n            accessed by \"element\", e.g. `data[i]`) or iterable (data needs to\n            be accessed with loop, e.g. `for x in data`). 
PyTorch, TensorFlow\n            and Onnx respectively accept input tensor in `torch.Tensor`,\n            `tf.Tensor` and `np.ndarray` formats. Note that each input\n            sample must be a tuple containing a tuple as first element, the\n            `inputs`, and the `label` as second element. The `inputs` needs to\n            be passed as tuple even if a single input is needed by the model\n            (in this case the `inputs` tuple will contain just an element).\n            HuggingFace models can take as data samples both dictionaries or\n            strings. Strings will then be converted in data samples using the\n            HuggingFace tokenizer which must be given as input when just a\n            list of string is provided as input_data (tokenizers can be passed\n            as extra arguments of this function using the keyword `tokenizer`).\n        metric_drop_ths (float, optional): Maximum reduction in the\n            selected metric accepted. No model with a higher error will be\n            accepted, i.e. all optimized model having a larger error respect to\n            the original one will be discarded, without even considering their\n            possible speed-up. Default: None, i.e. no drop in metric accepted.\n        metric (Union[Callable, str], optional): The metric to\n            be used for accepting or refusing a precision-reduction\n            optimization proposal. If none is given but a `metric_drop_ths` is\n            received, the `nebullvm.measure.compute_relative_difference`\n            metric will be used as default one. A user-defined metric can\n            be passed as function accepting as inputs two tuples of tensors\n            (produced by the baseline and the optimized model) and the related\n            original labels.\n            For more information see\n            `nebullvm.measure.compute_relative_difference` and\n            `nebullvm.measure.compute_accuracy_drop`. 
`metric`\n            accepts as value also a string containing the metric name. At the\n            current stage the supported metrics are `\"numeric_precision\"` and\n            `\"accuracy\"`. Default: `\"numeric_precision\"`\n        optimization_time (OptimizationTime, optional): The optimization time\n            mode. It can be either 'constrained' or 'unconstrained'. For\n            'constrained' mode just compilers and precision reduction\n            techniques are used (no compression). 'Unconstrained' optimization\n            allows the usage of more time-consuming techniques as pruning and\n            distillation. Note that for using many of the sophisticated\n            techniques in the 'unconstrained' optimization, a small fine-tuning\n            of the model will be needed. Thus we highly recommend to give as\n            input_data at least 100 samples for when selecting 'unconstrained'\n            optimization. Default: 'constrained'.\n        dynamic_info (Dict, optional): Dictionary containing info about the\n            dynamic axis. It should contain as keys both \"inputs\" and \"outputs\"\n            and as values two lists of dictionaries where each dictionary\n            represents the dynamic axis information for an input/output tensor.\n            The inner dictionary should have as key an integer, i.e. the\n            dynamic axis (considering also the batch size) and as value a\n            string giving a \"tag\" to it, e.g. \"batch_size\". Default: None\n        config_file (str, optional): Configuration file containing the\n            parameters needed for defining the CompressionStep in the pipeline.\n            Default: None.\n        ignore_compilers (List, optional): List containing the compilers to be\n            ignored during the OptimizerStep. 
The compiler name should be one\n            among tvm, tensor RT, openvino, onnxruntime, deepsparse, tflite,\n            bladedisc, torchscript, intel_neural_compressor. Default: None.\n        ignore_compressors (List, optional): List containing the compressors\n            to be ignored during the CompressionStep. The compiler name should\n            be one among . Default: None.\n        store_latencies (bool, optional): Parameter that allows to save the\n            latency for each compiler used by nebullvm. Default: False.\n        device (str, optional): Device used, can be 'cpu' or 'gpu'. If not\n            set, gpu will be used if available, otherwise cpu. Default: None\n\n    Returns:\n        InferenceLearner: Optimized version of the input model having the same\n            interface, imported by its original framework. For instance a\n            Pytorch model, when optimized, will return an InferenceLearner\n            object that can be call exactly as a PyTorch model (either\n            with `model.forward(input)` and `model(input)`), i.e. it will\n            take as input and it will return `torch.Tensor`s.\n    \"\"\"\n    root_op = SpeedsterRootOp()\n    device = check_device(device)\n\n    disable_log = True if not debug_mode_enabled() else False\n\n    with LoggingContext(logging.getLogger(), disabled=disable_log):\n        return root_op.to(device).execute(\n            model=model,\n            input_data=input_data,\n            metric_drop_ths=metric_drop_ths,\n            metric=metric,\n            optimization_time=optimization_time,\n            dynamic_info=dynamic_info,\n            config_file=config_file,\n            ignore_compilers=ignore_compilers,\n            ignore_compressors=ignore_compressors,\n            store_latencies=store_latencies,\n            **kwargs,\n        )\n"
  },
  {
    "path": "optimization/speedster/speedster/api/tests/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/speedster/speedster/api/tests/test_huggingface.py",
    "content": "from tempfile import TemporaryDirectory\n\nfrom nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST\nfrom nebullvm.operations.inference_learners.huggingface import (\n    HuggingFaceInferenceLearner,\n)\nfrom nebullvm.optional_modules.tensorflow import tensorflow as tf\nfrom nebullvm.optional_modules.torch import torch\nfrom transformers import AlbertModel, TFAlbertModel, AlbertTokenizer\n\nfrom speedster import optimize_model, load_model\n\n\ndef test_torch_huggingface_ort_input_text():\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = AlbertModel.from_pretrained(\"albert-base-v1\")\n\n    # Move the model to gpu if available\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    model.to(device)\n    model.eval()\n\n    input_data = [\n        \"this is a test\",\n        \"hi my name is Valerio\",\n        \"india is very far from italy\",\n    ]\n\n    optimized_model = optimize_model(\n        model=model,\n        input_data=input_data,\n        optimization_time=\"constrained\",\n        tokenizer=tokenizer,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        tokenizer_args=dict(\n            add_special_tokens=True,\n            return_attention_mask=True,\n            return_tensors=\"pt\",\n            return_token_type_ids=None,  # Sets to model default\n            padding=\"longest\",\n            truncation=True,\n        ),\n    )\n\n    # save and load\n    with TemporaryDirectory() as tmp_dir:\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(loaded_model, HuggingFaceInferenceLearner)\n\n        assert isinstance(loaded_model.get_size(), int)\n\n    x = [\"this is a test input to see if the optimized model works.\"]\n    inputs = tokenizer(x, 
return_tensors=\"pt\").to(device)\n    model.to(device)\n    res_original = model(**inputs)\n    res_optimized = optimized_model(**inputs)\n\n    assert isinstance(optimized_model, HuggingFaceInferenceLearner)\n\n    assert (\n        torch.mean(\n            abs(\n                (\n                    res_original[\"last_hidden_state\"]\n                    - res_optimized[\"last_hidden_state\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n    assert (\n        torch.mean(\n            abs(\n                (\n                    res_original[\"pooler_output\"]\n                    - res_optimized[\"pooler_output\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n\n\ndef test_torch_huggingface_ort_input_tensors():\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = AlbertModel.from_pretrained(\"albert-base-v1\")\n\n    # Move the model to gpu if available\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    model.to(device)\n    model.eval()\n\n    text = \"hi my name is Valerio\"\n    inputs = tokenizer(text, return_tensors=\"pt\").to(device)\n\n    dynamic_info = {\n        \"inputs\": [\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n        ],\n        \"outputs\": [{0: \"batch\", 1: \"num_tokens\"}, {0: \"batch\"}],\n    }\n\n    optimized_model = optimize_model(\n        model=model,\n        input_data=[inputs for _ in range(10)],\n        optimization_time=\"constrained\",\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        dynamic_info=dynamic_info,\n    )\n\n    x = [\"this is a test input to see if the optimized model works.\"]\n    inputs = tokenizer(x, return_tensors=\"pt\").to(device)\n    
model.to(device)\n    res_original = model(**inputs)\n    res_optimized = optimized_model(**inputs)\n\n    assert isinstance(optimized_model, HuggingFaceInferenceLearner)\n\n    assert (\n        torch.mean(\n            abs(\n                (\n                    res_original[\"last_hidden_state\"]\n                    - res_optimized[\"last_hidden_state\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n    assert (\n        torch.mean(\n            abs(\n                (\n                    res_original[\"pooler_output\"]\n                    - res_optimized[\"pooler_output\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n\n\ndef test_torch_huggingface_torchscript_input_tensors():\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = AlbertModel.from_pretrained(\"albert-base-v1\", torchscript=True)\n\n    # Move the model to gpu if available\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    model.to(device)\n    model.eval()\n\n    text = \"hi my name is Valerio\"\n    inputs = tokenizer(text, return_tensors=\"pt\").to(device)\n\n    dynamic_info = {\n        \"inputs\": [\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n        ],\n        \"outputs\": [{0: \"batch\", 1: \"num_tokens\"}, {0: \"batch\"}],\n    }\n\n    optimized_model = optimize_model(\n        model=model,\n        input_data=[inputs for _ in range(10)],\n        optimization_time=\"constrained\",\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"torchscript\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        dynamic_info=dynamic_info,\n    )\n\n    x = [\"this is a test input to see if the optimized model works.\"]\n    inputs = tokenizer(x, return_tensors=\"pt\").to(device)\n    model.to(device)\n   
 res_original = model(**inputs)\n    res_optimized = optimized_model(**inputs)\n\n    assert isinstance(optimized_model, HuggingFaceInferenceLearner)\n\n    assert torch.mean(abs((res_original[0] - res_optimized[0]))) < 1e-2\n    assert torch.mean(abs((res_original[1] - res_optimized[1]))) < 1e-2\n\n\ndef test_tensorflow_huggingface_ort_input_text_np():\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = TFAlbertModel.from_pretrained(\"albert-base-v1\")\n\n    input_data = [\n        \"this is a test\",\n        \"hi my name is Valerio\",\n        \"india is very far from italy\",\n    ]\n\n    dynamic_info = {\n        \"inputs\": [\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n        ],\n        \"outputs\": [{0: \"batch\", 1: \"num_tokens\"}, {0: \"batch\"}],\n    }\n\n    optimized_model = optimize_model(\n        model=model,\n        input_data=input_data,\n        optimization_time=\"constrained\",\n        tokenizer=tokenizer,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        tokenizer_args=dict(\n            add_special_tokens=True,\n            return_attention_mask=True,\n            return_tensors=\"np\",\n            return_token_type_ids=None,  # Sets to model default\n            padding=\"longest\",\n            truncation=True,\n        ),\n        dynamic_info=dynamic_info,\n    )\n\n    x = [\"this is a test input to see if the optimized model works.\"]\n    inputs = tokenizer(x, return_tensors=\"np\")\n    res_original = model(**inputs)\n    res_optimized = optimized_model(**inputs)\n\n    assert isinstance(optimized_model, HuggingFaceInferenceLearner)\n\n    assert (\n        tf.math.reduce_max(\n            abs(\n                (\n                    
res_original[\"last_hidden_state\"]\n                    - res_optimized[\"last_hidden_state\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n    assert (\n        tf.math.reduce_max(\n            abs(\n                (\n                    res_original[\"pooler_output\"]\n                    - res_optimized[\"pooler_output\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n\n\ndef test_tensorflow_huggingface_ort_input_tensors_np():\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = TFAlbertModel.from_pretrained(\"albert-base-v1\")\n\n    text = \"hi my name is Valerio\"\n    inputs = tokenizer(text, return_tensors=\"np\")\n\n    dynamic_info = {\n        \"inputs\": [\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n        ],\n        \"outputs\": [{0: \"batch\", 1: \"num_tokens\"}, {0: \"batch\"}],\n    }\n\n    optimized_model = optimize_model(\n        model=model,\n        input_data=[inputs for _ in range(10)],\n        optimization_time=\"constrained\",\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        dynamic_info=dynamic_info,\n    )\n\n    x = [\"Test to see if it works with a different output\"]\n    inputs = tokenizer(x, return_tensors=\"np\")\n    res_original = model(**inputs)\n    res_optimized = optimized_model(**inputs)\n\n    assert isinstance(optimized_model, HuggingFaceInferenceLearner)\n\n    assert (\n        tf.math.reduce_max(\n            abs(\n                (\n                    res_original[\"last_hidden_state\"]\n                    - res_optimized[\"last_hidden_state\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n    assert (\n        tf.math.reduce_max(\n            abs(\n            
    (\n                    res_original[\"pooler_output\"]\n                    - res_optimized[\"pooler_output\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n\n\ndef test_tensorflow_huggingface_ort_input_text_tf():\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = TFAlbertModel.from_pretrained(\"albert-base-v1\")\n\n    input_data = [\n        \"this is a test\",\n        \"hi my name is Valerio\",\n        \"india is very far from italy\",\n    ]\n\n    dynamic_info = {\n        \"inputs\": [\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n        ],\n        \"outputs\": [{0: \"batch\", 1: \"num_tokens\"}, {0: \"batch\"}],\n    }\n\n    optimized_model = optimize_model(\n        model=model,\n        input_data=input_data,\n        optimization_time=\"constrained\",\n        tokenizer=tokenizer,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        tokenizer_args=dict(\n            add_special_tokens=True,\n            return_attention_mask=True,\n            return_tensors=\"tf\",\n            return_token_type_ids=None,  # Sets to model default\n            padding=\"longest\",\n            truncation=True,\n        ),\n        dynamic_info=dynamic_info,\n    )\n\n    x = [\"this is a test input to see if the optimized model works.\"]\n    inputs = tokenizer(x, return_tensors=\"tf\")\n    res_original = model(**inputs)\n    res_optimized = optimized_model(**inputs)\n\n    assert isinstance(optimized_model, HuggingFaceInferenceLearner)\n\n    assert (\n        tf.math.reduce_max(\n            abs(\n                (\n                    res_original[\"last_hidden_state\"]\n                    - res_optimized[\"last_hidden_state\"]\n                )\n         
   )\n        )\n        < 1e-2\n    )\n    assert (\n        tf.math.reduce_max(\n            abs(\n                (\n                    res_original[\"pooler_output\"]\n                    - res_optimized[\"pooler_output\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n\n\ndef test_tensorflow_huggingface_ort_input_tensors_tf():\n    tokenizer = AlbertTokenizer.from_pretrained(\"albert-base-v1\")\n    model = TFAlbertModel.from_pretrained(\"albert-base-v1\")\n\n    text = \"hi my name is Valerio\"\n    inputs = tokenizer(text, return_tensors=\"tf\")\n\n    dynamic_info = {\n        \"inputs\": [\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n            {0: \"batch\", 1: \"num_tokens\"},\n        ],\n        \"outputs\": [{0: \"batch\", 1: \"num_tokens\"}, {0: \"batch\"}],\n    }\n\n    optimized_model = optimize_model(\n        model=model,\n        input_data=[inputs for _ in range(10)],\n        optimization_time=\"constrained\",\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        dynamic_info=dynamic_info,\n    )\n\n    x = [\"Test to see if it works with a different output\"]\n    inputs = tokenizer(x, return_tensors=\"tf\")\n    res_original = model(**inputs)\n    res_optimized = optimized_model(**inputs)\n\n    assert isinstance(optimized_model, HuggingFaceInferenceLearner)\n\n    assert (\n        tf.math.reduce_max(\n            abs(\n                (\n                    res_original[\"last_hidden_state\"]\n                    - res_optimized[\"last_hidden_state\"]\n                )\n            )\n        )\n        < 1e-2\n    )\n    assert (\n        tf.math.reduce_max(\n            abs(\n                (\n                    res_original[\"pooler_output\"]\n                    - res_optimized[\"pooler_output\"]\n         
       )\n            )\n        )\n        < 1e-2\n    )\n"
  },
  {
    "path": "optimization/speedster/speedster/api/tests/test_onnx.py",
    "content": "import cpuinfo\nfrom tempfile import TemporaryDirectory\n\nimport numpy as np\nimport pytest\nimport torch\nfrom nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST\nfrom nebullvm.operations.inference_learners.onnx import (\n    NumpyONNXInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.openvino import (\n    NumpyOpenVinoInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tensor_rt import (\n    NumpyONNXTensorRTInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tvm import (\n    NumpyApacheTVMInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import tvm_is_available\nfrom torchvision import models\n\nfrom speedster import optimize_model, load_model\nfrom speedster.api.tests.utils import torch_to_onnx\n\n\ndef test_onnx_ort():\n    with TemporaryDirectory() as tmp_dir:\n        model = models.resnet18()\n        input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n        model_path = torch_to_onnx(model, input_data, tmp_dir)\n\n        input_data = [\n            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)\n            for i in range(100)\n        ]\n\n        # Run nebullvm optimization in one line of code\n        optimized_model = optimize_model(\n            model_path,\n            input_data=input_data,\n            ignore_compilers=[\n                compiler\n                for compiler in COMPILER_LIST\n                if compiler != \"onnxruntime\"\n            ],\n            ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        )\n\n        with TemporaryDirectory() as tmp_dir:\n            optimized_model.save(tmp_dir)\n            loaded_model = load_model(tmp_dir)\n            assert isinstance(loaded_model, NumpyONNXInferenceLearner)\n\n            assert isinstance(loaded_model.get_size(), int)\n\n            # Try the optimized model\n            device = torch.device(\n                \"cuda\" if 
torch.cuda.is_available() else \"cpu\"\n            )\n            x = torch.randn(1, 3, 256, 256, requires_grad=False)\n            model.to(device).eval()\n            with torch.inference_mode():\n                res_original = model(x.to(device))\n            res_optimized = optimized_model(x.numpy())[0]\n\n            assert (\n                abs(\n                    (res_original.detach().cpu().numpy() - res_optimized)\n                ).max()\n                < 1e-2\n            )\n\n\ndef test_onnx_ort_quant():\n    with TemporaryDirectory() as tmp_dir:\n        model = models.resnet18()\n        input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n        model_path = torch_to_onnx(model, input_data, tmp_dir)\n\n        input_data = [\n            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)\n            for i in range(100)\n        ]\n\n        # Run nebullvm optimization in one line of code\n        optimized_model = optimize_model(\n            model_path,\n            input_data=input_data,\n            ignore_compilers=[\n                compiler\n                for compiler in COMPILER_LIST\n                if compiler != \"onnxruntime\"\n            ],\n            ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n            metric_drop_ths=2,\n        )\n\n        # Try the optimized model\n        device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n        model.to(device).eval()\n        x = torch.randn(1, 3, 256, 256, requires_grad=False)\n        with torch.inference_mode():\n            res_original = model(x.to(device))\n        res_optimized = optimized_model(x.numpy())[0]\n\n        assert isinstance(optimized_model, NumpyONNXInferenceLearner)\n        assert (\n            abs((res_original.detach().cpu().numpy() - res_optimized)).max()\n            < 1\n        )\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(),\n    reason=\"Skip because cuda is not 
available.\",\n)\ndef test_onnx_tensorrt():\n    with TemporaryDirectory() as tmp_dir:\n        model = models.resnet18()\n        input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n        model_path = torch_to_onnx(model, input_data, tmp_dir)\n\n        input_data = [\n            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)\n            for i in range(100)\n        ]\n\n        # Run nebullvm optimization in one line of code\n        optimized_model = optimize_model(\n            model_path,\n            input_data=input_data,\n            ignore_compilers=[\n                compiler\n                for compiler in COMPILER_LIST\n                if compiler != \"tensor_rt\"\n            ],\n            ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        )\n\n        # Try the optimized model\n        device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n        x = torch.randn(1, 3, 256, 256, requires_grad=False)\n        model.to(device).eval()\n        with torch.inference_mode():\n            res_original = model(x.to(device))\n        res_optimized = optimized_model(x.numpy())[0]\n\n        assert isinstance(optimized_model, NumpyONNXTensorRTInferenceLearner)\n        assert (\n            abs((res_original.detach().cpu().numpy() - res_optimized)).max()\n            < 1e-2\n        )\n\n\n@pytest.mark.skipif(\n    \"intel\" not in cpuinfo.get_cpu_info()[\"brand_raw\"].lower(),\n    reason=\"Openvino is only available for intel processors.\",\n)\ndef test_onnx_openvino():\n    with TemporaryDirectory() as tmp_dir:\n        model = models.resnet18()\n        input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n        model_path = torch_to_onnx(model, input_data, tmp_dir)\n\n        input_data = [\n            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)\n            for i in range(100)\n        ]\n\n        # Run nebullvm optimization in one line 
of code\n        optimized_model = optimize_model(\n            model_path,\n            input_data=input_data,\n            ignore_compilers=[\n                compiler\n                for compiler in COMPILER_LIST\n                if compiler != \"openvino\"\n            ],\n            ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n            device=\"cpu\",\n        )\n\n        # Try the optimized model\n        device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n        x = torch.randn(1, 3, 256, 256, requires_grad=False)\n        model.to(device).eval()\n        with torch.inference_mode():\n            res_original = model(x.to(device))\n        res_optimized = optimized_model(x.numpy())[0]\n\n        assert isinstance(optimized_model, NumpyOpenVinoInferenceLearner)\n        assert (\n            abs((res_original.detach().cpu().numpy() - res_optimized)).max()\n            < 1e-2\n        )\n\n\n@pytest.mark.skipif(\n    not tvm_is_available(), reason=\"Can't test tvm if it's not installed.\"\n)\ndef test_onnx_tvm():\n    with TemporaryDirectory() as tmp_dir:\n        model = models.resnet18()\n        input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n        model_path = torch_to_onnx(model, input_data, tmp_dir)\n\n        input_data = [\n            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)\n            for i in range(100)\n        ]\n\n        # Run nebullvm optimization in one line of code\n        optimized_model = optimize_model(\n            model_path,\n            input_data=input_data,\n            ignore_compilers=[\n                compiler for compiler in COMPILER_LIST if compiler != \"tvm\"\n            ],\n            ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        )\n\n        # Try the optimized model\n        device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n        x = torch.randn(1, 3, 256, 256, 
requires_grad=False)\n        model.to(device).eval()\n        with torch.inference_mode():\n            res_original = model(x.to(device))\n        res_optimized = optimized_model(x.numpy())[0]\n\n        assert isinstance(optimized_model, NumpyApacheTVMInferenceLearner)\n        assert (\n            abs((res_original.detach().cpu().numpy() - res_optimized)).max()\n            < 1e-2\n        )\n"
  },
  {
    "path": "optimization/speedster/speedster/api/tests/test_pytorch.py",
    "content": "import cpuinfo\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\nimport torchvision.models as models\nfrom nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST\nfrom nebullvm.operations.inference_learners.blade_disc import (\n    BladeDISCInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.onnx import (\n    PytorchONNXInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.openvino import (\n    PytorchOpenVinoInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tensor_rt import (\n    PytorchTensorRTInferenceLearner,\n    PytorchONNXTensorRTInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.torch_dynamo import (\n    TorchDynamoInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.torchscript import (\n    TorchScriptInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tvm import (\n    PytorchApacheTVMInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import (\n    tvm_is_available,\n    bladedisc_is_available,\n)\n\nfrom speedster import optimize_model, load_model\n\nfrom nebullvm.tools.utils import check_module_version\n\n\ndef test_torch_ort():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    with TemporaryDirectory() as tmp_dir:\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(loaded_model, PytorchONNXInferenceLearner)\n\n        assert isinstance(loaded_model.get_size(), int)\n\n    # Try the optimized model\n    device = torch.device(\"cuda\" if 
torch.cuda.is_available() else \"cpu\")\n    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)\n    model.to(device).eval()\n    res_original = model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(optimized_model, PytorchONNXInferenceLearner)\n    assert torch.max(abs((res_original - res_optimized))) < 1e-2\n\n\ndef test_torch_ort_quant():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        metric_drop_ths=2,\n    )\n\n    # Try the optimized model\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)\n    model.to(device).eval()\n    res_original = model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(optimized_model, PytorchONNXInferenceLearner)\n    assert torch.max(abs((res_original - res_optimized))) < 2\n\n\ndef test_torch_torchscript():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"torchscript\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    # Try the optimized model\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)\n    model.to(device).eval()\n    res_original = 
model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(optimized_model, TorchScriptInferenceLearner)\n    assert torch.max(abs((res_original - res_optimized))) < 1e-2\n\n\n@pytest.mark.skipif(\n    not check_module_version(torch, min_version=\"2.0.0\") or True,\n    reason=\"Torch version is not supported\",\n)\ndef test_torch_torch_dynamo():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler\n            for compiler in COMPILER_LIST\n            if compiler != \"torch_dynamo\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    # Try the optimized model\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)\n    model.to(device).eval()\n    res_original = model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(optimized_model, TorchDynamoInferenceLearner)\n    assert torch.max(abs((res_original - res_optimized))) < 1e-2\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(),\n    reason=\"Skip because cuda is not available.\",\n)\n@pytest.mark.skipif(\n    not check_module_version(torch, max_version=\"1.13.1+cu117\"),\n    reason=\"Skip because torch version is not supported.\",\n)\ndef test_torch_tensorrt():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"tensor_rt\"\n        ],\n        ignore_compressors=[compressor for compressor in 
COMPRESSOR_LIST],\n    )\n\n    # Try the optimized model\n    x = torch.randn(1, 3, 256, 256).cuda()\n    model.cuda().eval()\n    res_original = model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(\n        optimized_model, PytorchTensorRTInferenceLearner\n    ) or isinstance(optimized_model, PytorchONNXTensorRTInferenceLearner)\n    assert torch.max(abs((res_original - res_optimized))) < 1e-2\n\n\n@pytest.mark.skipif(\n    \"intel\" not in cpuinfo.get_cpu_info()[\"brand_raw\"].lower(),\n    reason=\"Openvino is only available for intel processors.\",\n)\ndef test_torch_openvino():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"openvino\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        device=\"cpu\",\n    )\n\n    # Try the optimized model\n    x = torch.randn(1, 3, 256, 256)\n    model.eval()\n    res_original = model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(optimized_model, PytorchOpenVinoInferenceLearner)\n    assert torch.max(abs((res_original.cpu() - res_optimized))) < 1e-2\n\n\n@pytest.mark.skipif(\n    not tvm_is_available(), reason=\"Can't test tvm if it's not installed.\"\n)\ndef test_torch_tvm():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"tvm\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    # Try the 
optimized model\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)\n    model.to(device).eval()\n    res_original = model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)\n    assert torch.max(abs((res_original - res_optimized))) < 1e-2\n\n\n@pytest.mark.skipif(\n    not bladedisc_is_available(),\n    reason=\"Can't test bladedisc if it's not installed.\",\n)\ndef test_torch_bladedisc():\n    model = models.resnet18()\n    input_data = [((torch.randn(1, 3, 256, 256),), 0) for i in range(100)]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"bladedisc\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    # Try the optimized model\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)\n    model.to(device).eval()\n    res_original = model(x)\n    res_optimized = optimized_model(x)[0]\n\n    assert isinstance(optimized_model, BladeDISCInferenceLearner)\n    assert torch.max(abs((res_original - res_optimized))) < 1e-2\n"
  },
  {
    "path": "optimization/speedster/speedster/api/tests/test_tensorflow.py",
    "content": "from tempfile import TemporaryDirectory\n\nimport cpuinfo\nimport pytest\nimport tensorflow as tf\nfrom keras.applications import ResNet50\nfrom nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST\nfrom nebullvm.operations.inference_learners.onnx import (\n    TensorflowONNXInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.openvino import (\n    TensorflowOpenVinoInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tensor_rt import (\n    TensorflowONNXTensorRTInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tensorflow import (\n    TensorflowBackendInferenceLearner,\n    TFLiteBackendInferenceLearner,\n)\nfrom nebullvm.operations.inference_learners.tvm import (\n    TensorflowApacheTVMInferenceLearner,\n)\nfrom nebullvm.operations.optimizations.compilers.utils import tvm_is_available\nfrom nebullvm.tools.utils import gpu_is_available\n\nfrom speedster import optimize_model, load_model\n\n# Limit tensorflow gpu memory usage\ngpus = tf.config.list_physical_devices(\"GPU\")\nif gpus:\n    try:\n        # Currently, memory growth needs to be the same across GPUs\n        for gpu in gpus:\n            tf.config.set_visible_devices(gpus[0], \"GPU\")\n            tf.config.experimental.set_memory_growth(gpu, True)\n            logical_gpus = tf.config.list_logical_devices(\"GPU\")\n            print(\n                len(gpus), \"Physical GPUs,\", len(logical_gpus), \"Logical GPUs\"\n            )\n    except RuntimeError as e:\n        # Memory growth must be set before GPUs have been initialized\n        print(e)\n\n\ndef test_tensorflow_ort():\n    model = ResNet50()\n    input_data = [\n        ((tf.random.normal([1, 224, 224, 3]),), 0) for i in range(100)\n    ]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != 
\"onnxruntime\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    with TemporaryDirectory() as tmp_dir:\n        optimized_model.save(tmp_dir)\n        loaded_model = load_model(tmp_dir)\n        assert isinstance(loaded_model, TensorflowONNXInferenceLearner)\n\n        assert isinstance(loaded_model.get_size(), int)\n\n    # Try the optimized model\n    x = tf.random.normal([1, 224, 224, 3])\n    res_original = model.predict(x)\n    res_optimized = optimized_model.predict(x)[0]\n\n    assert isinstance(optimized_model, TensorflowONNXInferenceLearner)\n    assert abs((res_original - res_optimized)).max() < 1e-2\n\n\ndef test_tensorflow_tf_backend():\n    model = ResNet50()\n    input_data = [\n        ((tf.random.normal([1, 224, 224, 3]),), 0) for i in range(100)\n    ]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"xla\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    # Try the optimized model\n    x = tf.random.normal([1, 224, 224, 3])\n    res_original = model.predict(x)\n    res_optimized = optimized_model.predict(x)[0]\n\n    assert isinstance(optimized_model, TensorflowBackendInferenceLearner)\n    assert abs((res_original - res_optimized)).max() < 1e-2\n\n\n@pytest.mark.skipif(\n    gpu_is_available(),\n    reason=\"TFLite does not support Nvidia GPUs\",\n)\ndef test_tensorflow_tflite():\n    model = ResNet50()\n    input_data = [\n        ((tf.random.normal([1, 224, 224, 3]),), 0) for i in range(100)\n    ]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"tflite\"\n        ],\n  
      ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        metric_drop_ths=0.1,\n    )\n\n    # Try the optimized model\n    x = tf.random.normal([1, 224, 224, 3])\n    res_original = model.predict(x)\n    res_optimized = optimized_model.predict(x)[0]\n\n    assert isinstance(optimized_model, TFLiteBackendInferenceLearner)\n    assert abs((res_original - res_optimized)).max() < 1e-2\n\n\n@pytest.mark.skipif(\n    not gpu_is_available(),\n    reason=\"Skip because cuda is not available.\",\n)\ndef test_tensorflow_tensorrt():\n    model = ResNet50()\n    input_data = [\n        ((tf.random.normal([1, 224, 224, 3]),), 0) for i in range(100)\n    ]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"tensor_rt\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    # Try the optimized model\n    x = tf.random.normal([1, 224, 224, 3])\n    res_original = model.predict(x)\n    res_optimized = optimized_model.predict(x)[0]\n\n    assert isinstance(optimized_model, TensorflowONNXTensorRTInferenceLearner)\n    assert abs((res_original - res_optimized)).max() < 1e-2\n\n\n@pytest.mark.skipif(\n    \"intel\" not in cpuinfo.get_cpu_info()[\"brand_raw\"].lower(),\n    reason=\"Openvino is only available for intel processors.\",\n)\ndef test_tensorflow_openvino():\n    model = ResNet50()\n    input_data = [\n        ((tf.random.normal([1, 224, 224, 3]),), 0) for i in range(100)\n    ]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"openvino\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n        
device=\"cpu\",\n    )\n\n    # Try the optimized model\n    x = tf.random.normal([1, 224, 224, 3])\n    res_original = model.predict(x)\n    res_optimized = optimized_model.predict(x)[0]\n\n    assert isinstance(optimized_model, TensorflowOpenVinoInferenceLearner)\n    assert abs((res_original - res_optimized)).max() < 1e-2\n\n\n@pytest.mark.skipif(\n    not tvm_is_available(), reason=\"Can't test tvm if it's not installed.\"\n)\ndef test_tensorflow_tvm():\n    model = ResNet50()\n    input_data = [\n        ((tf.random.normal([1, 224, 224, 3]),), 0) for i in range(100)\n    ]\n\n    # Run nebullvm optimization in one line of code\n    optimized_model = optimize_model(\n        model,\n        input_data=input_data,\n        ignore_compilers=[\n            compiler for compiler in COMPILER_LIST if compiler != \"tvm\"\n        ],\n        ignore_compressors=[compressor for compressor in COMPRESSOR_LIST],\n    )\n\n    # Try the optimized model\n    x = tf.random.normal([1, 224, 224, 3])\n    res_original = model.predict(x)\n    res_optimized = optimized_model.predict(x)[0]\n\n    assert isinstance(optimized_model, TensorflowApacheTVMInferenceLearner)\n    assert abs((res_original - res_optimized)).max() < 1e-2\n"
  },
  {
    "path": "optimization/speedster/speedster/api/tests/utils.py",
    "content": "import os\nfrom pathlib import Path\n\nfrom nebullvm.core.models import ModelParams, Device, DeviceType\nfrom nebullvm.operations.conversions.pytorch import convert_torch_to_onnx\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.utils import gpu_is_available\n\n\ndef torch_to_onnx(model, input_data, output_path):\n    model_params = ModelParams(1, [], [], [])\n    output_path = os.path.join(output_path, \"model.onnx\")\n    device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)\n    convert_torch_to_onnx(\n        model, DataManager(input_data), model_params, Path(output_path), device\n    )\n\n    return output_path\n"
  },
  {
    "path": "optimization/speedster/speedster/root_op.py",
    "content": "import json\nimport pickle\nimport sys\nfrom typing import (\n    Any,\n    Union,\n    Iterable,\n    Sequence,\n    Dict,\n    Callable,\n    List,\n)\n\nfrom loguru import logger\nfrom nebullvm import setup_logger\nfrom nebullvm.config import MIN_NUMBER\nfrom nebullvm.core.models import OptimizeInferenceResult, DeviceType\nfrom nebullvm.operations.base import Operation\nfrom nebullvm.operations.optimizations.optimize_inference import (\n    OptimizeInferenceOp,\n)\nfrom nebullvm.tools.data import DataManager\nfrom nebullvm.tools.feedback_collector import FeedbackCollector\nfrom tabulate import tabulate\n\nfrom nebullvm.tools.hardware_utils import get_hw_setup\nfrom nebullvm.tools.utils import (\n    get_model_size_mb,\n    get_model_name,\n    generate_model_id,\n)\n\nSPEEDSTER_FEEDBACK_COLLECTOR = FeedbackCollector(\n    url=\"https://nebuly.cloud/v1/store_speedster_results\",\n    disable_telemetry_environ_var=\"SPEEDSTER_DISABLE_TELEMETRY\",\n    app_version=\"0.4.0\",\n)\n\n\ndef _convert_technique(technique: str):\n    if technique.lower() == \"none\":  # use fp32 instead of none\n        technique = \"fp32\"\n    elif technique == \"HALF\":\n        technique = \"fp16\"\n    elif technique == \"STATIC\":\n        technique = \"int8\"\n    else:\n        technique = \"int8_dynamic\"\n    return technique\n\n\ndef _get_model_len(model: Any):\n    try:\n        return len(pickle.dumps(model, -1))\n    except Exception:\n        logger.warning(\n            \"Cannot pickle input model. 
Unable to \"\n            \"extract original model size\"\n        )\n        # Model is not pickable\n        return -1\n\n\nclass SpeedsterRootOp(Operation):\n    def __init__(self):\n        super().__init__()\n        self.optimize_inference_op = OptimizeInferenceOp()\n        self.set_feedback_collector(SPEEDSTER_FEEDBACK_COLLECTOR)\n\n    def _send_feedback(\n        self,\n        optimization_result: OptimizeInferenceResult,\n        store_latencies: bool = False,\n    ):\n        model_orig = optimization_result.original_model.model\n        model_name = get_model_name(model_orig)\n        model_info = {\n            \"model_name\": model_name,\n            \"model_size\": f\"{get_model_size_mb(model_orig)} MB\",\n            \"framework\": optimization_result.original_model.framework.value,\n        }\n        self.feedback_collector.store_info(\n            key=\"model_id\", value=generate_model_id(model_orig)\n        )\n        self.feedback_collector.store_info(\n            key=\"model_metadata\", value=model_info\n        )\n        self.feedback_collector.store_info(\n            key=\"hardware_setup\", value=get_hw_setup(self.device).__dict__\n        )\n        optimizations = self.feedback_collector.get(\"optimizations\")\n        original_model_dict = {\n            \"compiler\": optimization_result.original_model.framework.value,\n            \"technique\": \"original\",\n            \"latency\": optimization_result.original_model.latency_seconds,\n        }\n        optimizations.insert(0, original_model_dict)\n        self.feedback_collector.send_feedback()\n\n        if store_latencies:\n            model_id = self.feedback_collector.get(\"model_id\", \"\")\n            with open(\n                f\"{model_name}_latencies_{model_id[:10]}.json\", \"w\"\n            ) as f:\n                json.dump(\n                    {\n                        \"optimizations\": optimizations,\n                    },\n                    f,\n            
    )\n        self.feedback_collector.reset(\"optimizations\")\n        self.feedback_collector.reset(\"model_id\")\n        self.feedback_collector.reset(\"model_metadata\")\n\n    def execute(\n        self,\n        model: Any,\n        input_data: Union[Iterable, Sequence, DataManager],\n        metric_drop_ths: float = None,\n        metric: Union[str, Callable] = None,\n        optimization_time: str = \"constrained\",\n        dynamic_info: Dict = None,\n        config_file: str = None,\n        ignore_compilers: List[str] = None,\n        ignore_compressors: List[str] = None,\n        store_latencies: bool = False,\n        **kwargs,\n    ):\n        self.logger.info(\n            \"Running Speedster on {}{}\".format(\n                self.device.type.name,\n                f\":{self.device.idx}\"\n                if self.device.type is not DeviceType.CPU\n                else \"\",\n            )\n        )\n\n        result = self.optimize_inference_op.to(self.device).execute(\n            model=model,\n            input_data=input_data,\n            metric_drop_ths=metric_drop_ths,\n            metric=metric,\n            optimization_time=optimization_time,\n            dynamic_info=dynamic_info,\n            config_file=config_file,\n            ignore_compilers=ignore_compilers,\n            ignore_compressors=ignore_compressors,\n            store_latencies=store_latencies,\n            **kwargs,\n        )\n\n        if result.optimized_model is None:\n            return None\n\n        opt_metric_drop = (\n            f\"{result.metric_drop:.4f}\"\n            if result.metric_drop > MIN_NUMBER\n            else \"0\"\n        )\n\n        self._send_feedback(result, store_latencies=store_latencies)\n\n        table = [\n            [\n                \"backend\",\n                result.original_model.framework.name,\n                result.optimized_model.inference_learner.name,\n                \"\",\n            ],\n            [\n             
   \"latency\",\n                f\"{result.original_model.latency_seconds:.4f} sec/batch\",\n                f\"{result.optimized_model.latency_seconds:.4f} sec/batch\",\n                f\"{result.original_model.latency_seconds / result.optimized_model.latency_seconds:.2f}x\",  # noqa: E501\n            ],\n            [\n                \"throughput\",\n                f\"{result.original_model.throughput:.2f} \" f\"data/sec\",\n                f\"{result.optimized_model.throughput:.2f} \" f\"data/sec\",\n                f\"{result.optimized_model.throughput / result.original_model.throughput:.2f}x\",  # noqa: E501\n            ],\n            [\n                \"model size\",\n                f\"{result.original_model.size_mb:.2f} MB\",\n                f\"{result.optimized_model.size_mb:.2f} MB\",\n                f\"{min(int((result.optimized_model.size_mb-result.original_model.size_mb) / result.original_model.size_mb * 100), 0)}%\"  # noqa: E501\n                if result.original_model.size_mb > 0\n                else \"NA\",\n            ],\n            [\"metric drop\", \"\", opt_metric_drop, \"\"],\n            [\n                \"techniques\",\n                \"\",\n                f\"{_convert_technique(result.optimized_model.technique)}\",\n                \"\",\n            ],\n        ]\n        headers = [\n            \"Metric\",\n            \"Original Model\",\n            \"Optimized Model\",\n            \"Improvement\",\n        ]\n\n        # change format to the logger, avoiding printing verbose info\n        # to the console (as date, time, etc.)\n        self.logger.remove()\n        handler_id = self.logger.add(\n            sys.stdout, format=\"<level>{message}</level>\"\n        )\n        hw_info = get_hw_setup(self.device)\n        hw_name = (\n            hw_info.cpu\n            if self.device.type is DeviceType.CPU\n            else hw_info.accelerator\n        )\n        self.logger.info(\n            (\n                
f\"\\n[Speedster results on {hw_name}]\\n\"\n                f\"{tabulate(table, headers, tablefmt='heavy_outline')}\"\n            )\n        )\n\n        if (\n            result.original_model.latency_seconds\n            / result.optimized_model.latency_seconds\n            < 2\n        ):\n            self.logger.warning(\n                f\"\\nMax speed-up with your input parameters is \"\n                f\"{result.original_model.latency_seconds / result.optimized_model.latency_seconds:.2f}x. \"  # noqa: E501\n                f\"If you want to get a faster optimized model, \"\n                f\"see the following link for some suggestions: \"\n                f\"https://docs.nebuly.com/Speedster/advanced_\"\n                f\"options/#acceleration-suggestions\\n\"\n            )\n\n        self.logger.remove(handler_id)\n        setup_logger()\n\n        return result.optimized_model.inference_learner\n"
  },
  {
    "path": "optimization/speedster/speedster/speedster.py",
    "content": "from nebullvm.apps.base import App\n\nfrom speedster.root_op import SpeedsterRootOp\n\n\nclass SpeedsterApp(App):\n    def __init__(self):\n        super().__init__()\n        self.root_op = SpeedsterRootOp()\n\n    def execute(self, *args, **kwargs):\n        return self.root_op.execute(*args, **kwargs)\n"
  },
  {
    "path": "optimization/speedster/speedster/tests/__init__.py",
    "content": ""
  },
  {
    "path": "optimization/speedster/speedster/tests/test_root_op.py",
    "content": "from nebullvm.core.models import OptimizeInferenceResult\n\nfrom speedster.root_op import SpeedsterRootOp\n\n\ndef test_root_op_no_optim_model(mocker):\n    root_op = SpeedsterRootOp()\n\n    mocker.patch.object(\n        root_op.optimize_inference_op,\n        \"execute\",\n        return_value=OptimizeInferenceResult(\n            original_model=mocker.MagicMock(),\n            optimized_model=None,\n            hardware_setup=mocker.MagicMock(),\n        ),\n    )\n\n    res = root_op.execute(\n        model=None,\n        input_data=mocker.MagicMock(),\n        metric_drop_ths=None,\n        metric=\"latency\",\n        optimization_time=mocker.MagicMock(),\n        dynamic_info=None,\n        config_file=None,\n        ignore_compilers=None,\n        ignore_compressors=None,\n        store_latencies=False,\n    )\n\n    assert res is None\n\n\ndef test_root_op_optim_model(mocker):\n    root_op = SpeedsterRootOp()\n\n    mocker.patch.object(\n        root_op.optimize_inference_op,\n        \"execute\",\n        return_value=OptimizeInferenceResult(\n            original_model=mocker.MagicMock(\n                latency_seconds=1, throughput=1, size_mb=1\n            ),\n            optimized_model=mocker.MagicMock(\n                metric_drop=0.1, latency_seconds=1, size_mb=1, throughput=1\n            ),\n            hardware_setup=mocker.MagicMock(),\n        ),\n    )\n\n    mocker.patch.object(root_op, \"_send_feedback\")\n\n    res = root_op.execute(\n        model=None,\n        input_data=mocker.MagicMock(),\n        metric_drop_ths=None,\n        metric=\"latency\",\n        optimization_time=mocker.MagicMock(),\n        dynamic_info=None,\n        config_file=None,\n        ignore_compilers=None,\n        ignore_compressors=None,\n        store_latencies=False,\n    )\n\n    assert res is not None\n"
  },
  {
    "path": "optimization/speedster/speedster/utils.py",
    "content": ""
  },
  {
    "path": "optimization/speedster/speedster.toml",
    "content": "[build-system]\nrequires = [\n    \"setuptools>=42\",\n    \"wheel\"\n]\nbuild-backend = \"setuptools.build_meta\""
  }
]