[
  {
    "path": ".gitignore",
    "content": "__pycache__/\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/pycqa/isort\n    rev: 5.12.0\n    hooks:\n      - id: isort\n        name: isort (python)\n  - repo: https://github.com/psf/black\n    rev: 23.1.0\n    hooks:\n      - id: black\n        args: [--line-length=119,--target-version=py35]\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM nvidia/cuda:11.6.1-devel-ubi8 as base\n\nRUN dnf install -y --disableplugin=subscription-manager make git && dnf clean all --disableplugin=subscription-manager\n\n# taken form pytorch's dockerfile\nRUN curl -L -o ./miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \\\n    chmod +x ./miniconda.sh && \\\n    ./miniconda.sh -b -p /opt/conda && \\\n    rm ./miniconda.sh\n\nENV PYTHON_VERSION=3.9 \\\n    PATH=/opt/conda/envs/inference/bin:/opt/conda/bin:${PATH}\n\n# create conda env\nRUN conda create -n inference python=${PYTHON_VERSION} pip -y\n\n# change shell to activate env\nSHELL [\"conda\", \"run\", \"-n\", \"inference\", \"/bin/bash\", \"-c\"]\n\nFROM base as conda\n\n# update conda\nRUN conda update -n base -c defaults conda -y\n# cmake\nRUN conda install -c anaconda cmake -y\n\n# necessary stuff\nRUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \\\n    transformers==4.26.1 \\\n    deepspeed==0.7.6 \\\n    accelerate==0.16.0 \\\n    gunicorn==20.1.0 \\\n    flask \\\n    flask_api \\\n    fastapi==0.89.1 \\\n    uvicorn==0.19.0 \\\n    jinja2==3.1.2 \\\n    pydantic==1.10.2 \\\n    huggingface_hub==0.12.1 \\\n\tgrpcio-tools==1.50.0 \\\n    --no-cache-dir\n\n# clean conda env\nRUN conda clean -ya\n\n# change this as you like 🤗\nENV TRANSFORMERS_CACHE=/cos/HF_cache \\\n    HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE}\n\nFROM conda as app\n\nWORKDIR /src\nRUN chmod -R g+w /src\n\nRUN mkdir /.cache && \\\n    chmod -R g+w /.cache\n\nENV PORT=5000 \\\n    UI_PORT=5001\nEXPOSE ${PORT}\nEXPOSE ${UI_PORT}\n\nCMD git clone https://github.com/huggingface/transformers-bloom-inference.git && \\\n    cd transformers-bloom-inference && \\\n    # install grpc and compile protos\n    make gen-proto && \\\n    make bloom-560m\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "Makefile",
    "content": "gen-proto:\n\tmkdir -p inference_server/model_handler/grpc_utils/pb\n\n\tpython -m grpc_tools.protoc -Iinference_server/model_handler/grpc_utils/proto --python_out=inference_server/model_handler/grpc_utils/pb --grpc_python_out=inference_server/model_handler/grpc_utils/pb inference_server/model_handler/grpc_utils/proto/generation.proto\n\n\tfind inference_server/model_handler/grpc_utils/pb/ -type f -name \"*.py\" -print0 -exec sed -i -e 's/^\\(import.*pb2\\)/from . \\1/g' {} \\;\n\n\ttouch inference_server/model_handler/grpc_utils/__init__.py\n\ttouch inference_server/model_handler/grpc_utils/pb/__init__.py\n\n\trm -rf inference_server/model_handler/grpc_utils/pb/*.py-e\n\nui:\n\tpython -m ui --ui_host 127.0.0.1 --ui_port 5001 --generation_backend_host 127.0.0.1 --generation_backend_port 5000 &\n\n# ------------------------- DS inference -------------------------\nbloom-176b:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=bigscience/bloom \\\n\tMODEL_CLASS=AutoModelForCausalLM \\\n\tDEPLOYMENT_FRAMEWORK=ds_inference \\\n\tDTYPE=fp16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=4 \\\n\tCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\n# loads faster than the above one\nmicrosoft-bloom-176b:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \\\n\tMODEL_CLASS=AutoModelForCausalLM \\\n\tDEPLOYMENT_FRAMEWORK=ds_inference \\\n\tDTYPE=fp16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=4 \\\n\tCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\nbloomz-176b:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=bigscience/bloomz \\\n\tMODEL_CLASS=AutoModelForCausalLM \\\n\tDEPLOYMENT_FRAMEWORK=ds_inference 
\\\n\tDTYPE=fp16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=4 \\\n\tCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\nbloom-176b-int8:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \\\n\tMODEL_CLASS=AutoModelForCausalLM \\\n\tDEPLOYMENT_FRAMEWORK=ds_inference \\\n\tDTYPE=int8 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=4 \\\n\tCUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\n# ------------------------- HF accelerate -------------------------\nbloom-560m:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=bigscience/bloom-560m \\\n\tMODEL_CLASS=AutoModelForCausalLM \\\n\tDEPLOYMENT_FRAMEWORK=hf_accelerate \\\n\tDTYPE=bf16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=32 \\\n\tCUDA_VISIBLE_DEVICES=0 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\nflan-t5-xxl:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=google/flan-t5-xxl \\\n\tMODEL_CLASS=AutoModelForSeq2SeqLM \\\n\tDEPLOYMENT_FRAMEWORK=hf_accelerate \\\n\tDTYPE=bf16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=4 \\\n\tCUDA_VISIBLE_DEVICES=0 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\nul2:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=google/ul2 \\\n\tMODEL_CLASS=AutoModelForSeq2SeqLM \\\n\tDEPLOYMENT_FRAMEWORK=hf_accelerate \\\n\tDTYPE=bf16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=4 \\\n\tCUDA_VISIBLE_DEVICES=0 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s 
%(t)s \"%(r)s\" %(s)s %(b)s'\n\ncodegen-mono:\n\tmake ui\n\n\tTOKENIZERS_PARALLELISM=false \\\n\tMODEL_NAME=Salesforce/codegen-16B-mono \\\n\tMODEL_CLASS=AutoModelForCausalLM \\\n\tDEPLOYMENT_FRAMEWORK=hf_accelerate \\\n\tDTYPE=bf16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=4 \\\n\tCUDA_VISIBLE_DEVICES=0 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\n# ------------------------- HF CPU -------------------------\nbloom-560m-cpu:\n\tmake ui\n\n\tMODEL_NAME=bigscience/bloom-560m \\\n\tMODEL_CLASS=AutoModelForCausalLM \\\n\tDEPLOYMENT_FRAMEWORK=hf_cpu \\\n\tDTYPE=fp32 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=32 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n\nflan-t5-base-cpu:\n\tmake ui\n\n\tMODEL_NAME=google/flan-t5-base \\\n\tMODEL_CLASS=AutoModelForSeq2SeqLM \\\n\tDEPLOYMENT_FRAMEWORK=hf_cpu \\\n\tDTYPE=bf16 \\\n\tMAX_INPUT_LENGTH=2048 \\\n\tMAX_BATCH_SIZE=32 \\\n\tgunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s \"%(r)s\" %(s)s %(b)s'\n"
  },
  {
    "path": "README.md",
    "content": "> [!NOTE]  \n> This repository has been archived and is not being maintained any longer since a lot more efficient serving frameworks have been released recently like vLLM and TGI.\n\n# Fast Inference Solutions for BLOOM\n\nThis repo provides demos and packages to perform fast inference solutions for BLOOM. Some of the solutions have their own repos in which case a link to the [corresponding repos](#Other-inference-solutions) is provided instead.\n\n\n# Inference solutions for BLOOM 176B\n\nWe support HuggingFace accelerate and DeepSpeed Inference for generation.\n\nInstall required packages:\n\n```shell\npip install flask flask_api gunicorn pydantic accelerate huggingface_hub>=0.9.0 deepspeed>=0.7.3 deepspeed-mii==0.0.2\n```\n\nalternatively you can also install deepspeed from source:\n```shell\ngit clone https://github.com/microsoft/DeepSpeed\ncd DeepSpeed\nCFLAGS=\"-I$CONDA_PREFIX/include/\" LDFLAGS=\"-L$CONDA_PREFIX/lib/\" TORCH_CUDA_ARCH_LIST=\"7.0\" DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 pip install -e . --global-option=\"build_ext\" --global-option=\"-j8\" --no-cache -v --disable-pip-version-check\n```\n\nAll the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B (fp16/bf16) and 4 A100 80GB GPUs for BLOOM 176B (int8). These scripts might not work for other models or a different number of GPUs.\n\nDS inference is deployed using logic borrowed from DeepSpeed MII library.\n\nNote: Sometimes GPU memory is not freed when DS inference deployment crashes. You can free this memory by running `killall python` in terminal.\n\nFor using BLOOM quantized, use dtype = int8. Also, change the model_name to microsoft/bloom-deepspeed-inference-int8 for DeepSpeed-Inference. 
For HF accelerate, no change is needed for model_name.\n\nHF accelerate uses [LLM.int8()](https://arxiv.org/abs/2208.07339) and DS-inference uses [ZeroQuant](https://arxiv.org/abs/2206.01861) for post-training quantization.\n\n## BLOOM inference via command-line\n\nThis asks for generate_kwargs every time.\nExample: generate_kwargs =\n```json\n{\"min_length\": 100, \"max_new_tokens\": 100, \"do_sample\": false}\n```\n\n1. using HF accelerate\n```shell\npython -m inference_server.cli --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --generate_kwargs '{\"min_length\": 100, \"max_new_tokens\": 100, \"do_sample\": false}'\n```\n\n2. using DS inference\n```shell\npython -m inference_server.cli --model_name microsoft/bloom-deepspeed-inference-fp16 --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --generate_kwargs '{\"min_length\": 100, \"max_new_tokens\": 100, \"do_sample\": false}'\n```\n\n## BLOOM server deployment\n\n[make <model_name>](../Makefile) can be used to launch a generation server. Please note that the serving method is synchronous and users have to wait in queue until the preceding requests have been processed. An example to fire server requests is given [here](./server_request.py). Alternatively, a [Dockerfile](./Dockerfile) is also provided which launches a generation server on port 5000.\n\nAn interactive UI can be launched via the following command to connect to the generation server. The default URL of the UI is `http://127.0.0.1:5001/`. The `model_name` is just used by the UI to check if the model is a decoder or encoder-decoder model.\n```shell\npython -m ui --model_name bigscience/bloom\n```\nThis command launches the following UI to play with generation. Sorry for the crappy design. Unfortunately, my UI skills only go so far. 😅😅😅\n![image](assets/UI.png)\n\n## Benchmark system for BLOOM inference\n\n1. 
using HF accelerate\n```shell\npython -m inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5\n```\n\n2. using DS inference\n```shell\ndeepspeed --num_gpus 8 --module inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5\n```\nalternatively, to load model faster:\n```shell\ndeepspeed --num_gpus 8 --module inference_server.benchmark --model_name microsoft/bloom-deepspeed-inference-fp16 --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5\n```\n\n3. using DS ZeRO\n```shell\ndeepspeed --num_gpus 8 --module inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5\n```\n\n# Support\n\n\nIf you run into things not working or have other questions please open an Issue in the corresponding backend:\n\n- [Accelerate](https://github.com/huggingface/accelerate/issues)\n- [Deepspeed-Inference](https://github.com/microsoft/DeepSpeed/issues)\n- [Deepspeed-ZeRO](https://github.com/microsoft/DeepSpeed/issues)\n\nIf there is a specific issue with one of the scripts and not the backend only then please open an Issue here and tag [@mayank31398](https://github.com/mayank31398).\n\n\n# Other inference solutions\n## Client-side solutions\n\nSolutions developed to perform large batch inference locally:\n\n* [Custom HF Code](https://github.com/huggingface/transformers_bloom_parallel/).\n\nJAX:\n\n* [BLOOM Inference in JAX](https://github.com/huggingface/bloom-jax-inference)\n\n\n## Server solutions\n\nA solution developed to be used in a server mode (i.e. varied batch size, varied request rate) can be found [here](https://github.com/Narsil/bloomserver). This is implemented in Rust.\n"
  },
  {
    "path": "bloom-inference-scripts/README.md",
    "content": "# Inference scripts for BLOOM\n\n## BLOOM Inference solutions\n\nHere are some benchmark resuls on JeanZay's 8x80GB A100 node w/ 512GB of CPU memory:\n\nAll benchmarks are doing greedy generation of 100 token outputs:\n```\nGenerate args {'max_length': 100, 'do_sample': False}\n```\nThe input prompt is comprised of just a few tokens.\n\nThroughput in msecs on 8x80GB gpus:\n\n| project      \\ bs |      1 |     8 |    16 |    32 |   64 |  128 |  256 | 512  |\n| :---------------- | :----- | :---- | :---- | :---- | :--- | :--- | :--- | :--- |\n| accelerate   bf16 | 230.38 | 31.78 | 17.84 | 10.89 |  oom |      |      |      |\n| accelerate   int8 | 286.56 | 40.92 | 22.65 | 13.27 |  oom |      |      |      |\n| ds-inference fp16 |  44.02 |  5.70 |  3.01 |  1.68 | 1.00 | 0.69 |  oom |      |\n| ds-inference int8 |  89.09 | 11.44 |  5.88 |  3.09 | 1.71 | 1.02 | 0.71 | oom  |\n| ds-zero      bf16 |    283 | 34.88 |   oom |       |      |      |      |      |\n\nnote: Since Deepspeed-ZeRO can process multiple generate streams in parallel its throughput can be further divided by 8 or 16, depending on whether 8 or 16 gpus were used during the generate. 
and, of course, it means that it can process a bs of 64 in the case of 8x80 A100 (the table above).\n\nStart to ready to generate in secs (mainly loading and data preparation time):\n\n| project                 |      |\n| :---------------------- | :--- |\n| accelerate              |  121 |\n| ds-inference shard-int8 |   61 |\n| ds-inference shard-fp16 |   60 |\n| ds-inference unsharded  |  662 |\n| ds-zero                 |  462 |\n\nNow let's look at the power of quantized int8-based models provided by [Deepspeed-Inference](https://www.deepspeed.ai/tutorials/inference-tutorial/) and [BitsNBytes](https://github.com/TimDettmers/bitsandbytes), as it requires only half the original GPU memory of inference in bfloat16 or float16.\n\nThroughput in msecs 4x80GB A100:\n\n| project      \\ bs |      1 |     8 |    16 |    32 |   64 | 128  |\n| :---------------- | :----- | :---- | :---- | :---- | :--- | :--- |\n| accelerate   int8 | 284.15 | 40.14 | 21.97 |  oom  |      |      |\n| ds-inference int8 | 156.51 | 20.11 | 10.38 |  5.50 | 2.96 | oom  |\n\nTo get the benchmark results simply add `--benchmark` to any of these 3 scripts discussed below.\n\n\n## Deepspeed-Inference\n\nDeepspeed-Inference uses Tensor-Parallelism and efficient fused CUDA kernels:\nhttps://www.deepspeed.ai/tutorials/inference-tutorial/\n\n### Setup\n\n```\npip install deepspeed>=0.7.3\n```\n\n### Run\n\n1. the fastest approach is to use a tp-pre-sharded checkpoint that takes only ~1min to load, as compared to 10min for non-presharded bloom checkpoint\n\n\n```\ndeepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-fp16\n```\n\n1a.\nif you want to run the original bloom checkpoint, which once loaded will run at the same throughput as the previous solution, but the loading will take 10-20min:\n\n```\ndeepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name bigscience/bloom\n```\n\n2a. 
The 8bit quantized version requires you to have only half the GPU memory of the normal half precision version:\n\n\n```\ndeepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-int8 --dtype int8\n```\n\nHere we used `microsoft/bloom-deepspeed-inference-int8` and also told the script to run in `int8`.\n\nAnd of course, just 4x80GB A100 gpus is now sufficient:\n\n```\ndeepspeed --num_gpus 4 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-int8 --dtype int8\n```\n\n\n\n## HF Accelerate\n\nHF Accelerate can use naive Pipeline Parallelism to load a huge model over multiple GPUs:\nhttps://github.com/huggingface/accelerate\n\n### Setup\n\n```\npip install transformers>=4.21.3 accelerate>=0.12.0\n```\n\n\n### Run\n\n\n```\npython bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-accelerate-inference_bs=1.txt\n```\n\nTo activate the 8bit quantized solution first install `bitsandbytes`:\n\n```\npip install bitsandbytes\n```\n\nand then add `--dtype int8` to the previous command line:\n\n```\npython bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --dtype int8 --batch_size 1 --benchmark 2>&1 | tee bloom-int8-accelerate-inference_bs=1.txt\n```\n\nif you have more than 4 GPUs you can tell it to use only 4 with:\n```\nCUDA_VISIBLE_DEVICES=0,1,2,3 python bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --dtype int8 --batch_size 1 --benchmark 2>&1 | tee bloom-int8-accelerate-inference_bs=1.txt\n```\n\n\n## Deepspeed ZeRO-Inference\n\n\n[Deepspeed ZeRO](https://www.deepspeed.ai/tutorials/zero/) uses a magical sharding approach which can take almost any model and scale it across a few or hundreds of GPUs.\n\n### Setup\n\n```\npip install deepspeed\n```\n\n\n### Run\n\nNote that the script currently runs the same inputs on all GPUs, but you can run a different 
stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference.\n\n\n```\ndeepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt\n```\n\nPlease remember that with ZeRO the user can generate multiple unique streams at the same time - and thus the overall performance should be throughput in secs/token divided by number of participating gpus - so 8x to 16x faster depending on whether 8 or 16 gpus were used!\n\nYou can also try the offloading solutions with just one small GPU, which will take a long time to run, but if you don't have 8 huge GPUs this is as good as it gets.\n\n\nCPU-Offload (1x gpus):\n```\ndeepspeed --num_gpus 1 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --cpu_offload --benchmark 2>&1 | tee bloom-ds-zero-inference-cpu_offload_bs=8.txt\n```\n\nNVMe-Offload (1x gpus):\n```\ndeepspeed --num_gpus 1 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --nvme_offload_path=/path/to/nvme_offload --benchmark 2>&1 | tee bloom-ds-zero-inference-nvme_offload_bs=8.txt\n```\n\nmake sure to adjust `/path/to/nvme_offload` to somewhere you have ~400GB of free memory on a fast NVMe drive.\n\n## Support\n\nIf you run into things not working or have other questions please open an Issue in the corresponding backend:\n\n- [Accelerate](https://github.com/huggingface/accelerate/issues)\n- [Deepspeed-Inference](https://github.com/microsoft/DeepSpeed/issues)\n- [Deepspeed-ZeRO](https://github.com/microsoft/DeepSpeed/issues)\n\nIf there is a specific issue with one of the scripts and not the backend only then please open an Issue here and tag [@stas00](https://github.com/stas00).\n"
  },
  {
    "path": "bloom-inference-scripts/bloom-accelerate-inference.py",
    "content": "import argparse\nimport gc\nimport math\nimport os\nimport time\n\nimport torch\nimport torch.distributed as dist\n\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--local_rank\", required=False, type=int, help=\"used by dist launchers\")\n    parser.add_argument(\"--name\", type=str, help=\"Name path\", required=True)\n    parser.add_argument(\"--batch_size\", default=1, type=int, help=\"batch size\")\n    parser.add_argument(\"--benchmark\", action=\"store_true\", help=\"additionally run benchmark\")\n    parser.add_argument(\"--greedy\", action=\"store_true\")\n    parser.add_argument(\"--top-k\", type=int, default=0)\n    parser.add_argument(\"--top-p\", type=float, default=0.0)\n    parser.add_argument(\"--dtype\", type=str, help=\"float16 or int8\", choices=[\"int8\", \"float16\"], default=\"float16\")\n\n    return parser.parse_args()\n\n\nt_start = time.time()\n\nnum_tokens = 100\n\nargs = get_args()\n\nlocal_rank = int(os.getenv(\"LOCAL_RANK\", \"0\"))\nworld_size = torch.cuda.device_count()\n\nrank = local_rank\n\n\ndef print_rank0(*msg):\n    if rank != 0:\n        return\n    print(*msg)\n\n\nprint_rank0(f\"Using {world_size} gpus\")\nmodel_name = args.name\nprint_rank0(f\"Loading model {model_name}\")\n\ntokenizer = AutoTokenizer.from_pretrained(model_name)\n\n# XXX: can't automatically derive dtype via config's `from_pretrained`\ndtype = torch.bfloat16 if model_name in [\"bigscience/bloom\", \"bigscience/bigscience-small-testing\"] else torch.float16\n\n# print(get_max_memory_per_gpu_dict())\n\ninfer_dtype = args.dtype\nif infer_dtype == \"int8\":\n    dtype = torch.int8\n\nkwargs = dict(\n    device_map=\"auto\",\n)\n\n\ndef get_world_size() -> int:\n    if dist.is_initialized():\n        return dist.get_world_size()\n    else:\n        return 1\n\n\n# balanced_low_0 - because it allows a larger batch size with multiple 
GPUs\nif get_world_size() > 1:\n    kwargs[\"device_map\"] = \"balanced_low_0\"\n\n\nif infer_dtype == \"int8\":\n    print_rank0(\"Using `load_in_8bit=True` to use quantized model\")\n    kwargs[\"load_in_8bit\"] = True\nelse:\n    kwargs[\"torch_dtype\"] = dtype\n\n\nmodel = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)\n\n\nif args.benchmark:\n    t_ready = time.time()\n\n\n### Generate\n\nprint_rank0(f\"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}\")\n\ninput_sentences = [\n    \"DeepSpeed is a machine learning framework\",\n    \"He is working on\",\n    \"He has a\",\n    \"He got all\",\n    \"Everyone is happy and I can\",\n    \"The new movie that got Oscar this year\",\n    \"In the far far distance from our galaxy,\",\n    \"Peace is the only way\",\n]\n\nif args.batch_size > len(input_sentences):\n    # dynamically extend to support larger bs by repetition\n    input_sentences *= math.ceil(args.batch_size / len(input_sentences))\n\ngenerate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False)\n# generate_kwargs = dict(max_new_tokens=num_tokens, use_cache=False, do_sample=False)\n# generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False)\n\nprint_rank0(f\"Generate args {generate_kwargs}\")\ninputs = input_sentences[: args.batch_size]\n\n\ndef generate():\n    \"\"\"returns a list of zipped inputs, outputs and number of new tokens\"\"\"\n\n    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors=\"pt\", padding=True)\n    for t in input_tokens:\n        if torch.is_tensor(input_tokens[t]):\n            input_tokens[t] = input_tokens[t].to(\"cuda:0\")\n\n    outputs = model.generate(**input_tokens, **generate_kwargs)\n\n    input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids]\n    output_tokens_lengths = [x.shape[0] for x in outputs]\n\n    total_new_tokens = [o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths)]\n    outputs = 
tokenizer.batch_decode(outputs, skip_special_tokens=True)\n\n    return zip(inputs, outputs, total_new_tokens)\n\n\nprint_rank0(\"*** Running generate\")\nt_generate_start = time.time()\ngenerated = generate()\nt_generate_span = time.time() - t_generate_start\nfor i, o, _ in generated:\n    print_rank0(f\"{'-'*60}\\nin={i}\\nout={o}\\n\")\n\n\n### Benchmark\n\nif args.benchmark:\n    # clear cache / free memory\n    torch.cuda.empty_cache()\n    gc.collect()\n\n    print_rank0(\"*** Running benchmark\")\n    # warm up\n    for i in range(1):\n        _ = generate()\n    torch.cuda.synchronize()\n\n    # benchmark\n    t0 = time.time()\n    cycles = 5\n    total_new_tokens_generated = 0\n    for i in range(cycles):\n        generated = generate()\n        total_new_tokens_generated += sum(new_tokens for _, _, new_tokens in generated)\n    torch.cuda.synchronize()\n    throughput = (time.time() - t0) / (total_new_tokens_generated)\n    print_rank0(\n        f\"\"\"\n*** Performance stats:\nThroughput per token including tokenize: {throughput*1000:.2f} msecs\nStart to ready to generate: {t_ready - t_start:.3f} secs\nTokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs\nStart to finish: {t_ready - t_start + t_generate_span:.3f} secs\n\"\"\"\n    )\n"
  },
  {
    "path": "bloom-inference-scripts/bloom-ds-inference.py",
    "content": "# usage:\n# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom\n#\n# to run benchmarks:\n# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark\n#\n\n\n# This is going to improve, but at the moment, the process is a bit cumbersome - we first use\n# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints,\n# 2. free the allocated storage\n# 3. start Deepspeed-Inference and only now load the checkpoint\n# 4. run generate\n# Done.\n#\n\n\nimport gc\nimport io\nimport json\nimport math\nimport os\nimport time\nfrom argparse import ArgumentParser\nfrom pathlib import Path\n\nimport torch\nimport torch.distributed as dist\n\nimport deepspeed\nfrom huggingface_hub import snapshot_download\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer\nfrom transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock\nfrom transformers.utils import is_offline_mode\n\n\n# the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time.\ntp_presharded_models = [\"microsoft/bloom-deepspeed-inference-int8\", \"microsoft/bloom-deepspeed-inference-fp16\"]\n\nt_start = time.time()\n\nnum_tokens = 100\n\nparser = ArgumentParser()\n\nparser.add_argument(\"--name\", required=True, type=str, help=\"model_name\")\nparser.add_argument(\"--dtype\", type=str, help=\"float16 or int8\", choices=[\"int8\", \"float16\"], default=\"float16\")\nparser.add_argument(\"--local_rank\", required=False, type=int, help=\"used by dist launchers\")\nparser.add_argument(\"--batch_size\", default=1, type=int, help=\"batch size\")\nparser.add_argument(\"--benchmark\", action=\"store_true\", help=\"additionally run benchmark\")\nargs = parser.parse_args()\n\nlocal_rank = int(os.getenv(\"LOCAL_RANK\", \"0\"))\nworld_size = int(os.getenv(\"WORLD_SIZE\", \"1\"))\n\ndeepspeed.init_distributed(\"nccl\")\nrank = dist.get_rank()\n\n\ndef print_rank0(*msg):\n 
   if rank != 0:\n        return\n    print(*msg)\n\n\n### Model loading and instantiating on GPUs\n\n\ndef get_repo_root(model_name_or_path):\n    # checks if online or not\n    if is_offline_mode():\n        print_rank0(\"Offline mode: forcing local_files_only=True\")\n\n    # download only on first process\n    if rank == 0:\n        snapshot_download(\n            model_name_or_path,\n            local_files_only=is_offline_mode(),\n            cache_dir=os.getenv(\"TRANSFORMERS_CACHE\", None),\n            ignore_patterns=[\"*.safetensors\"],\n        )\n\n    dist.barrier()\n\n    return snapshot_download(\n        model_name_or_path,\n        local_files_only=is_offline_mode(),\n        cache_dir=os.getenv(\"TRANSFORMERS_CACHE\", None),\n        ignore_patterns=[\"*.safetensors\"],\n    )\n\n\ndef get_checkpoint_files(model_name_or_path):\n    cached_repo_dir = get_repo_root(model_name_or_path)\n\n    # extensions: .bin | .pt\n    # creates a list of paths from all downloaded files in cache dir\n    file_list = [str(entry) for entry in Path(cached_repo_dir).rglob(\"*.[bp][it][n]\") if entry.is_file()]\n    return file_list\n\n\nmodel_name = args.name\ninfer_dtype = args.dtype\n\ntp_presharded_mode = True if model_name in tp_presharded_models else False\n\n# print(get_checkpoint_files(model_name))\n\nprint_rank0(f\"*** Loading the model {model_name}\")\n\ntokenizer = AutoTokenizer.from_pretrained(model_name)\nconfig = AutoConfig.from_pretrained(model_name)\n\n# XXX: can't automatically derive dtype via config's `from_pretrained`\n# dtype = torch.bfloat16 if model_name in [\"bigscience/bloom\", \"bigscience/bigscience-small-testing\"] else torch.float16\n\n\n# use one of these args to `init_inference`\n# 1. injection_policy is the slower version, but it's plain pytorch so it'll always work\n# 2. 
replace_with_kernel_inject is the faster one (fast fused kernels)\nkernel_inject = True\n# kernel_inject = False\n\nif kernel_inject:\n    # XXX: for now ds-inference only works with fp16\n    dtype = torch.float16\nelse:\n    dtype = torch.bfloat16\n\nif args.benchmark:\n    torch.cuda.empty_cache()\n    gc.collect()\n    deepspeed.runtime.utils.see_memory_usage(\"pre-from-pretrained\", force=True)\n\n# Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load\nwith deepspeed.OnDevice(dtype=dtype, device=\"meta\"):\n    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)\n\nif args.benchmark:\n    deepspeed.runtime.utils.see_memory_usage(\"post-from-pretrained\", force=True)\n\nmodel = model.eval()\n\nif args.benchmark:\n    torch.cuda.empty_cache()\n    gc.collect()\n    deepspeed.runtime.utils.see_memory_usage(\"post-init-ds-zero-init\", force=True)\n\n### Deepspeed-Inference Loading\n\ncheckpoints_json = \"checkpoints.json\"\n\n\ndef write_checkpoints_json():\n    checkpoint_files = get_checkpoint_files(model_name)\n    if rank == 0:\n        data = {\"type\": \"BLOOM\", \"checkpoints\": checkpoint_files, \"version\": 1.0}\n        json.dump(data, open(checkpoints_json, \"w\"))\n\n\nif args.benchmark:\n    torch.cuda.empty_cache()\n    gc.collect()\n    deepspeed.runtime.utils.see_memory_usage(\"pre-ds-inference-init\", force=True)\n\nif kernel_inject:\n    kwargs = dict(replace_with_kernel_inject=True)\nelse:\n    kwargs = dict(injection_policy={BloomBlock: (\"self_attention.dense\", \"mlp.dense_4h_to_h\")})\n\nrepo_root = get_repo_root(model_name)\nif tp_presharded_mode:\n    # tp presharded repos come with their own checkpoints config file\n    checkpoints_json = os.path.join(repo_root, \"ds_inference_config.json\")\nelse:\n    # for normal bloom repo we need to write the checkpoints config file\n    write_checkpoints_json()\n    dist.barrier()\n\n# checkpoints_json=None\nmodel = 
deepspeed.init_inference(\n    model,\n    mp_size=world_size,\n    base_dir=repo_root,\n    dtype=getattr(torch, infer_dtype),\n    checkpoint=checkpoints_json,\n    **kwargs,\n)\n\nif args.benchmark:\n    torch.cuda.empty_cache()\n    gc.collect()\n    deepspeed.runtime.utils.see_memory_usage(\"post-ds-inference-init\", force=True)\n\n\nmodel = model.module\n\nif args.benchmark:\n    t_ready = time.time()\n\n\n### Generate\n\n\nprint_rank0(f\"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}\")\n\ninput_sentences = [\n    \"DeepSpeed is a machine learning framework\",\n    \"He is working on\",\n    \"He has a\",\n    \"He got all\",\n    \"Everyone is happy and I can\",\n    \"The new movie that got Oscar this year\",\n    \"In the far far distance from our galaxy,\",\n    \"Peace is the only way\",\n]\n\nif args.batch_size > len(input_sentences):\n    # dynamically extend to support larger bs by repetition\n    input_sentences *= math.ceil(args.batch_size / len(input_sentences))\n\ngenerate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False)\n\n\nprint_rank0(f\"Generate args {generate_kwargs}\")\n\ninputs = input_sentences[: args.batch_size]\n\n\ndef generate():\n    \"\"\"returns a list of zipped inputs, outputs and number of new tokens\"\"\"\n\n    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors=\"pt\", padding=True)\n    for t in input_tokens:\n        if torch.is_tensor(input_tokens[t]):\n            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())\n\n    outputs = model.generate(**input_tokens, **generate_kwargs)\n\n    input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids]\n    output_tokens_lengths = [x.shape[0] for x in outputs]\n\n    total_new_tokens = [o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths)]\n    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)\n\n    return zip(inputs, outputs, total_new_tokens)\n\n\n# warmup is a must if 
measuring speed as it's when all the optimizations are performed\n# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs\nprint_rank0(\"*** Running generate warmup\")\n_ = generate()\n\nprint_rank0(\"*** Running generate\")\nt_generate_start = time.time()\ngenerated = generate()\nt_generate_span = time.time() - t_generate_start\nfor i, o, _ in generated:\n    print_rank0(f\"{'-'*60}\\nin={i}\\nout={o}\\n\")\n\nif args.benchmark:\n    torch.cuda.empty_cache()\n    gc.collect()\n    deepspeed.runtime.utils.see_memory_usage(\"end-of-run\", force=True)\n\n### Benchmark\n\n# benchmark it!\nif args.benchmark:\n    print_rank0(\"*** Running benchmark\")\n\n    # warm up\n    for i in range(1):\n        _ = generate()\n    torch.cuda.synchronize()\n\n    # benchmark\n    t0 = time.time()\n    cycles = 5\n    total_new_tokens_generated = 0\n    for i in range(cycles):\n        generated = generate()\n        total_new_tokens_generated += sum(new_tokens for _, _, new_tokens in generated)\n    torch.cuda.synchronize()\n    throughput = (time.time() - t0) / (total_new_tokens_generated)\n    print_rank0(\n        f\"\"\"\n*** Performance stats:\nThroughput per token including tokenize: {throughput*1000:.2f} msecs\nStart to ready to generate: {t_ready - t_start:.3f} secs\nTokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs\nStart to finish: {t_ready - t_start + t_generate_span:.3f} secs\n\"\"\"\n    )\n"
  },
  {
    "path": "bloom-inference-scripts/bloom-ds-zero-inference.py",
    "content": "# usage:\n# deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom\n#\n# to run benchmarks:\n# deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --benchmark\n#\n\n\n# This is going to improve, but at the moment, the process is a bit cumbersome - we first use\n# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints,\n# 2. free the allocated storage\n# 3. start Deepspeed-Inference and only now load the checkpoint\n# 4. run generate\n# Done.\n#\n\n\nimport gc\nimport math\nimport os\nimport time\nfrom argparse import ArgumentParser\n\nimport torch\nimport torch.distributed as dist\n\nimport deepspeed\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer\nfrom transformers.deepspeed import HfDeepSpeedConfig\nfrom transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock\n\n\nt_start = time.time()\n\nnum_tokens = 100\n\nparser = ArgumentParser()\n\nparser.add_argument(\"--name\", required=True, type=str, help=\"model_name\")\nparser.add_argument(\"--local_rank\", required=False, type=int, help=\"used by dist launchers\")\nparser.add_argument(\"--batch_size\", default=1, type=int, help=\"batch size\")\nparser.add_argument(\"--benchmark\", action=\"store_true\", help=\"additionally run benchmark\")\nparser.add_argument(\"--cpu_offload\", action=\"store_true\", help=\"whether to activate CPU offload\")\nparser.add_argument(\"--nvme_offload_path\", help=\"whether to activate NVME offload and the path on nvme\")\nargs = parser.parse_args()\n\nlocal_rank = int(os.getenv(\"LOCAL_RANK\", \"0\"))\nworld_size = int(os.getenv(\"WORLD_SIZE\", \"1\"))\n\ndeepspeed.init_distributed(\"nccl\")\nrank = dist.get_rank()\n\n\ndef print_rank0(*msg):\n    if rank != 0:\n        return\n    print(*msg)\n\n\n### Model loading and instantiating on GPU (via ZeRO)\n\nmodel_name = args.name\n\nprint_rank0(f\"*** Loading the model {model_name}\")\n\ntokenizer = 
AutoTokenizer.from_pretrained(model_name)\nconfig = AutoConfig.from_pretrained(model_name)\n\n# XXX: can't automatically derive dtype via config's `from_pretrained`\ndtype = torch.bfloat16 if model_name in [\"bigscience/bloom\", \"bigscience/bigscience-small-testing\"] else torch.float16\n\nmodel_hidden_size = config.hidden_size\ntrain_batch_size = 1 * world_size\n\nds_config = {\n    \"fp16\": {\n        \"enabled\": dtype == torch.float16,\n    },\n    \"bf16\": {\n        \"enabled\": dtype == torch.bfloat16,\n    },\n    \"zero_optimization\": {\n        \"stage\": 3,\n        \"overlap_comm\": True,\n        \"contiguous_gradients\": True,\n        \"reduce_bucket_size\": model_hidden_size * model_hidden_size,\n        \"stage3_prefetch_bucket_size\": 0.9 * model_hidden_size * model_hidden_size,\n        \"stage3_param_persistence_threshold\": 0,\n    },\n    \"steps_per_print\": 2000,\n    \"train_batch_size\": train_batch_size,\n    \"train_micro_batch_size_per_gpu\": 1,\n    \"wall_clock_breakdown\": False,\n}\n\nif args.cpu_offload and args.nvme_offload_path:\n    raise ValueError(\"Use one of --cpu_offload or --nvme_offload_path and not both\")\n\nif args.cpu_offload:\n    ds_config[\"zero_optimization\"][\"offload_param\"] = dict(device=\"cpu\", pin_memory=True)\n\nif args.nvme_offload_path:\n    ds_config[\"zero_optimization\"][\"offload_param\"] = dict(\n        device=\"nvme\",\n        pin_memory=True,\n        nvme_path=args.nvme_offload_path,\n        buffer_size=4e9,\n    )\n\ndschf = HfDeepSpeedConfig(ds_config)  # this tells from_pretrained to instantiate directly on gpus\n\nif args.benchmark:\n    torch.cuda.empty_cache()\n    gc.collect()\n    deepspeed.runtime.utils.see_memory_usage(\"pre-from-pretrained\", force=True)\n\nmodel = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)\n\nif args.benchmark:\n    deepspeed.runtime.utils.see_memory_usage(\"post-from-pretrained\", force=True)\n\nmodel = 
model.eval()\n\nprint_rank0(ds_config)\n\nds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]\nds_engine.module.eval()\nmodel = ds_engine.module\n\nif args.benchmark:\n    t_ready = time.time()\n    deepspeed.runtime.utils.see_memory_usage(\"start-of-generate\", force=True)\n\n\n### Generate\n\nprint_rank0(f\"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}\")\n\ninput_sentences = [\n    \"DeepSpeed is a machine learning framework\",\n    \"He is working on\",\n    \"He has a\",\n    \"He got all\",\n    \"Everyone is happy and I can\",\n    \"The new movie that got Oscar this year\",\n    \"In the far far distance from our galaxy,\",\n    \"Peace is the only way\",\n]\n\nif args.batch_size > len(input_sentences):\n    # dynamically extend to support larger bs by repetition\n    input_sentences *= math.ceil(args.batch_size / len(input_sentences))\n\ngenerate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False)\n# Important: if using multiple unique streams to avoid hanging if one generation finished early - one must also add:\n# generate_kwargs.update(synced_gpus=True)\n\nprint_rank0(f\"Generate args {generate_kwargs}\")\ninputs = input_sentences[: args.batch_size]\n\n\ndef generate():\n    \"\"\"returns a list of zipped inputs, outputs and number of new tokens\"\"\"\n\n    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors=\"pt\", padding=True)\n    for t in input_tokens:\n        if torch.is_tensor(input_tokens[t]):\n            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())\n\n    outputs = model.generate(**input_tokens, **generate_kwargs)\n\n    input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids]\n    output_tokens_lengths = [x.shape[0] for x in outputs]\n\n    total_new_tokens = [o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths)]\n    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)\n\n    return zip(inputs, outputs, 
total_new_tokens)\n\n\n# XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size\n\nprint_rank0(\"*** Running generate\")\nt_generate_start = time.time()\npairs = generate()\nt_generate_span = time.time() - t_generate_start\nfor i, o, _ in pairs:\n    print_rank0(f\"{'-'*60}\\nin={i}\\nout={o}\\n\")\n\n\n### Benchmark\n\nif args.benchmark:\n    # clear cache / free memory\n    torch.cuda.empty_cache()\n    gc.collect()\n    deepspeed.runtime.utils.see_memory_usage(\"end-of-generate\", force=True)\n\n    print_rank0(\"*** Running benchmark\")\n\n    # warm up\n    for i in range(1):\n        _ = generate()\n    torch.cuda.synchronize()\n\n    # benchmark\n    t0 = time.time()\n    cycles = 5\n    total_new_tokens_generated = 0\n    for i in range(cycles):\n        generated = generate()\n        total_new_tokens_generated += sum(new_tokens for _, _, new_tokens in generated)\n\n    torch.cuda.synchronize()\n    # note that we actually generate world_size unique streams (though the benchmark feeds the same inputs)\n    total_new_tokens_generated *= world_size\n    throughput = (time.time() - t0) / (total_new_tokens_generated)\n    print_rank0(\n        f\"\"\"\n*** Performance stats:\nThroughput per token including tokenize: {throughput*1000:.2f} msecs\nStart to ready to generate: {t_ready - t_start:.3f} secs\nTokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs\nStart to finish: {t_ready - t_start + t_generate_span:.3f} secs\n\"\"\"\n    )\n"
  },
  {
    "path": "inference_server/benchmark.py",
    "content": "import argparse\nimport gc\nfrom functools import partial\n\nimport torch\n\nfrom .constants import DS_INFERENCE, DS_ZERO\nfrom .model_handler.deployment import ModelDeployment\nfrom .models import start_inference_engine\nfrom .utils import (\n    GenerateRequest,\n    create_generate_request,\n    get_argument_parser,\n    get_dummy_batch,\n    get_world_size,\n    parse_args,\n    print_rank_0,\n    run_and_log_time,\n)\n\n\ndef benchmark_generation(model: ModelDeployment, request: GenerateRequest, cycles: int = 5):\n    # run benchmarks for number of cycles\n    total_new_tokens_generated = 0\n    for _ in range(cycles):\n        response = model.generate(request=request)\n        total_new_tokens_generated += sum(new_tokens for new_tokens in response.num_generated_tokens)\n    return total_new_tokens_generated\n\n\ndef get_benchmark_results(\n    benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int\n) -> str:\n    throughput = total_new_tokens_generated / benchmark_time\n    latency = benchmark_time / cycles\n    return f\"\"\"\n*** Performance stats:\nThroughput (including tokenization) = {throughput:.2f} tokens/sec\nThroughput (including tokenization) = {1000 / throughput:.2f} msecs/token\nModel loading time = {initialization_time:.2f} secs\nTotal tokens generated = {total_new_tokens_generated} with batch size = {batch_size}\nLatency = {latency:.2f} secs\nModel loading time + generation time per batch = {initialization_time + latency:.2f} secs\n\"\"\"\n\n\ndef benchmark_end_to_end(args: argparse.Namespace) -> None:\n    model, initialization_time = run_and_log_time(partial(ModelDeployment, args=args, grpc_allowed=False))\n\n    request = create_generate_request(get_dummy_batch(args.batch_size), args.generate_kwargs)\n\n    print_rank_0(f\"generate_kwargs = {args.generate_kwargs}\")\n    print_rank_0(f\"batch_size = {args.batch_size}\")\n\n    # warmup is a must if measuring speed as 
it's when all the optimizations are performed\n    # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs\n    response = model.generate(request=request)\n\n    for i, (o, _) in zip(request.text, zip(response.text, response.num_generated_tokens)):\n        print_rank_0(f\"{'-' * 60}\\nin = {i}\\nout = {o}\\n\")\n\n    if args.benchmark_cycles > 0:\n        print_rank_0(\"*** Running benchmark\")\n\n        torch.cuda.empty_cache()\n        gc.collect()\n\n        # warm up\n        model.generate(request=request)\n        torch.cuda.synchronize()\n\n        # benchmark\n        total_new_tokens_generated, benchmark_time = run_and_log_time(\n            partial(benchmark_generation, model=model, request=request, cycles=args.benchmark_cycles)\n        )\n\n        # with ZeRO every GPU is generating batch_size * sequence_length tokens\n        if args.deployment_framework == DS_ZERO:\n            total_new_tokens_generated *= get_world_size()\n\n        print_rank_0(\n            get_benchmark_results(\n                benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles\n            )\n        )\n\n\ndef get_args() -> argparse.Namespace:\n    parser = get_argument_parser()\n\n    group = parser.add_argument_group(title=\"launch config\")\n    group.add_argument(\"--benchmark_cycles\", type=int, default=0, help=\"additionally run benchmark\")\n    group.add_argument(\"--local_rank\", required=False, type=int, help=\"used by dist launchers\")\n    group.add_argument(\"--batch_size\", default=1, type=int, help=\"batch size\")\n    group.add_argument(\"--cpu_offload\", action=\"store_true\", help=\"whether to activate CPU offload for DS ZeRO\")\n\n    args = parse_args(parser)\n\n    launched_with_deepspeed = args.deployment_framework in [DS_INFERENCE, DS_ZERO]\n\n    assert args.max_batch_size == None, \"max_batch_size is not supported with benchmark\"\n\n    if not 
launched_with_deepspeed:\n        assert args.local_rank == None, \"local_rank must be None if not launched with DeepSpeed\"\n\n    if args.cpu_offload:\n        assert args.deployment_framework == DS_ZERO, \"cpu_offload only works with DS_ZeRO\"\n\n    return args\n\n\ndef main() -> None:\n    args = get_args()\n    start_inference_engine(args.deployment_framework)\n    benchmark_end_to_end(args)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "inference_server/cli.py",
    "content": "import argparse\nimport json\nimport sys\n\nfrom .model_handler import ModelDeployment\nfrom .utils import get_argument_parser, parse_args, print_rank_0\n\n\ndef get_args() -> argparse.Namespace:\n    parser = get_argument_parser()\n    args = parse_args(parser)\n    return args\n\n\ndef main() -> None:\n    args = get_args()\n\n    model = ModelDeployment(args, True)\n\n    generate_kwargs = args.generate_kwargs\n\n    while True:\n        input_text = input(\"Input text: \")\n\n        if input(\"change generate_kwargs? [y/n] \") == \"y\":\n            while True:\n                try:\n                    generate_kwargs = json.loads(input(\"Generate kwargs: \"))\n                    break\n                except Exception as e:\n                    e_type, e_message, _ = sys.exc_info()\n                    print(\"error =\", e_type.__name__)\n                    print(\"message =\", e_message)\n                    continue\n\n        response = model.generate(text=[input_text], generate_kwargs=generate_kwargs)\n\n        print_rank_0(\"Output text:\", response.text[0])\n        print_rank_0(\"Generated tokens:\", response.num_generated_tokens[0])\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "inference_server/constants.py",
    "content": "# inference method (args.deployment_framework)\nHF_ACCELERATE = \"hf_accelerate\"\nHF_CPU = \"hf_cpu\"\nDS_INFERENCE = \"ds_inference\"\nDS_ZERO = \"ds_zero\"\n\n# GRPC_MAX_MSG_SIZE = 2**30  # 1GB\n"
  },
  {
    "path": "inference_server/download_model.py",
    "content": "import argparse\n\nfrom inference_server.models import get_hf_model_class\nfrom transformers import AutoConfig, AutoTokenizer\n\n\ndef get_args() -> argparse.Namespace:\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument(\n        \"--model_name\",\n        type=str,\n        required=True,\n        help=\"model to use\",\n    )\n    parser.add_argument(\n        \"--model_class\",\n        type=str,\n        required=True,\n        help=\"model class to use\",\n    )\n\n    args = parser.parse_args()\n\n    return args\n\n\ndef main() -> None:\n    args = get_args()\n    print(\"downloading\", args.model_name)\n    AutoConfig.from_pretrained(args.model_name)\n    AutoTokenizer.from_pretrained(args.model_name)\n    get_hf_model_class(args.model_class).from_pretrained(args.model_name)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "inference_server/model_handler/__init__.py",
    "content": "from .deployment import ModelDeployment\n"
  },
  {
    "path": "inference_server/model_handler/deployment.py",
    "content": "\"\"\"\nCopyright 2022 The Microsoft DeepSpeed Team\n\"\"\"\nimport argparse\nimport asyncio\nimport subprocess\nimport time\nfrom typing import List\n\nimport grpc\n\nfrom ..constants import DS_INFERENCE, DS_ZERO\nfrom ..models import get_model_class, load_tokenizer\nfrom ..utils import (\n    ForwardRequest,\n    ForwardResponse,\n    GenerateResponse,\n    TokenizeRequest,\n    TokenizeResponse,\n    create_generate_request,\n    get_cuda_visible_devices,\n    get_str_dtype,\n    get_world_size,\n    print_rank_0,\n)\nfrom .grpc_utils.pb import generation_pb2, generation_pb2_grpc\n\n\nclass ModelDeployment:\n    def __init__(self, args: argparse.Namespace, grpc_allowed: bool = False):\n        self.cuda_visible_devices = get_cuda_visible_devices()\n        self.num_gpus = get_world_size()\n\n        self.use_grpc_server = self.should_use_grpc(args.deployment_framework, grpc_allowed)\n\n        if self.use_grpc_server:\n            self.tokenizer = load_tokenizer(args.model_name)\n\n            self.initialize_ports()\n\n            self.dtype_proto_field = {\n                str: \"svalue\",\n                int: \"ivalue\",\n                float: \"fvalue\",\n                bool: \"bvalue\",\n            }\n\n            self._initialize_service(args)\n            self._wait_until_server_is_live()\n\n            self.asyncio_loop = asyncio.get_event_loop()\n            self._initialize_grpc_client()\n        else:\n            self.model = get_model_class(args.deployment_framework)(args)\n\n        print_rank_0(\"model loaded\")\n\n    def should_use_grpc(self, deployment_framework: str, grpc_allowed: bool) -> bool:\n        if grpc_allowed and get_world_size() > 1:\n            return deployment_framework in [DS_INFERENCE, DS_ZERO]\n        return False\n\n    def initialize_ports(self):\n        self.ports = []\n        for i in range(self.num_gpus):\n            self.ports.append(50950 + self.cuda_visible_devices[i])\n\n    def 
_is_socket_open(self, port):\n        import socket\n\n        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        result = sock.connect_ex((\"0.0.0.0\", port))\n        sock.close()\n        return result == 0\n\n    def _is_server_process_alive(self):\n        if self.process is None:\n            return True\n        try:\n            self.process.wait(1)\n        except subprocess.TimeoutExpired as err:\n            # timeout means we're still running and all (probably) okay\n            is_alive = True\n        else:\n            # no exception case\n            is_alive = False\n        return is_alive\n\n    def _wait_until_server_is_live(self):\n        sockets_open = False\n        while not sockets_open:\n            sockets_open = self._is_socket_open(self.ports[0])\n            process_alive = self._is_server_process_alive()\n            if not process_alive:\n                raise RuntimeError(\"server crashed for some reason, unable to proceed\")\n            time.sleep(4)\n            print_rank_0(\"waiting for server to start...\")\n        print_rank_0(f\"server has started on {self.ports[0]}\")\n\n    def dict_to_proto(self, generate_kwargs: dict) -> dict:\n        result = {}\n        for k, v in generate_kwargs.items():\n            if v is not None:\n                x = generation_pb2.Value()\n                setattr(x, self.dtype_proto_field[type(v)], v)\n                result[k] = x\n\n        return result\n\n    def _initialize_service(self, args: argparse.Namespace):\n        if self._is_socket_open(self.ports[0]):\n            raise RuntimeError(\n                f\"Server is already running on port {self.ports}, please shutdown or use different port.\"\n            )\n\n        if args.deployment_framework in [DS_INFERENCE, DS_ZERO]:\n            ports = \" \".join(map(str, self.ports))\n\n            cmd = f\"inference_server.model_handler.launch --model_name {args.model_name} --deployment_framework 
{args.deployment_framework} --dtype {get_str_dtype(args.dtype)} --port {ports} --model_class {args.model_class}\"\n\n            if args.max_batch_size is not None:\n                cmd += f\" --max_batch_size {args.max_batch_size}\"\n            if args.max_input_length is not None:\n                cmd += f\" --max_input_length {args.max_input_length}\"\n\n            master_port = 29500 + min(self.cuda_visible_devices)\n\n            cuda_visible_devices = \",\".join(map(str, self.cuda_visible_devices))\n\n            cmd = f\"deepspeed --master_port {master_port} --include localhost:{cuda_visible_devices} --module {cmd}\"\n        else:\n            raise NotImplementedError(f\"unsupported deployment_framework: {args.deployment_framework}\")\n\n        cmd = cmd.split(\" \")\n        self.process = subprocess.Popen(cmd)\n\n    def _initialize_grpc_client(self):\n        self.stubs = []\n        for i in self.ports:\n            channel = grpc.aio.insecure_channel(f\"localhost:{i}\")\n            stub = generation_pb2_grpc.GenerationServiceStub(channel)\n            self.stubs.append(stub)\n\n    # runs task in parallel and return the result from the first task\n    async def generate_in_tensor_parallel(self, text: List[str], generate_kwargs: dict):\n        responses = []\n        for i in range(self.num_gpus):\n            responses.append(self.asyncio_loop.create_task(self.generate_async(i, text, generate_kwargs)))\n\n        await responses[0]\n        return responses[0]\n\n    async def generate_async(self, stub_id: int, text: List[str], generate_kwargs: dict):\n        req = generation_pb2.GenerationRequestProto(texts=text, generate_kwargs=generate_kwargs)\n        response = await self.stubs[stub_id].Generate(req)\n        return response\n\n    # runs task in parallel and return the result from the first task\n    async def forward_in_tensor_parallel(self, conditioning_text: List[str], response: List[str]):\n        responses = []\n        for i in 
range(self.num_gpus):\n            responses.append(self.asyncio_loop.create_task(self.forward_async(i, conditioning_text, response)))\n\n        await responses[0]\n        return responses[0]\n\n    async def forward_async(self, stub_id: int, conditioning_text: List[str], response: List[str]):\n        req = generation_pb2.ForwardRequestProto(conditioning_text=conditioning_text, response=response)\n        response = await self.stubs[stub_id].Forward(req)\n        return response\n\n    def generate(self, **kwargs) -> GenerateResponse:\n        if self.use_grpc_server:\n            if \"request\" in kwargs:\n                text = kwargs[\"request\"].text\n                generate_kwargs = kwargs[\"request\"].get_generate_kwargs()\n            else:\n                text = kwargs[\"text\"]\n                generate_kwargs = kwargs[\"generate_kwargs\"]\n\n            generate_kwargs = self.dict_to_proto(generate_kwargs)\n\n            response = self.asyncio_loop.run_until_complete(\n                self.generate_in_tensor_parallel(text, generate_kwargs)\n            ).result()\n\n            if response.error:\n                raise Exception(response.error)\n            else:\n                return GenerateResponse(\n                    text=[r for r in response.texts], num_generated_tokens=[n for n in response.num_generated_tokens]\n                )\n        else:\n            if \"request\" in kwargs:\n                request = kwargs[\"request\"]\n            else:\n                request = create_generate_request(**kwargs)\n\n            response = self.model.generate(request)\n\n            if isinstance(response, Exception):\n                raise response\n            else:\n                return response\n\n    def forward(self, request: ForwardRequest) -> ForwardResponse:\n        if self.use_grpc_server:\n            response = self.asyncio_loop.run_until_complete(\n                self.forward_in_tensor_parallel(request.conditioning_text, 
request.response)\n            ).result()\n\n            if response.error:\n                raise Exception(response.error)\n            else:\n                return ForwardResponse(nll=response.nll)\n        else:\n            response = self.model.forward(request)\n\n            if isinstance(response, Exception):\n                raise response\n            else:\n                return response\n\n    def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:\n        if self.use_grpc_server:\n            response = self.tokenizer(request.text, padding=request.padding)\n            response = TokenizeResponse(token_ids=response.input_ids, attention_mask=response.attention_mask)\n        else:\n            response = self.model.tokenize(request)\n\n        return response\n"
  },
  {
    "path": "inference_server/model_handler/grpc_utils/__init__.py",
    "content": ""
  },
  {
    "path": "inference_server/model_handler/grpc_utils/generation_server.py",
    "content": "import os\nfrom concurrent import futures\n\nimport torch\n\nimport grpc\n\n# from ...constants import GRPC_MAX_MSG_SIZE\nfrom ...models import Model\nfrom ...utils import ForwardRequest, TokenizeRequest, create_generate_request, print_rank_0\nfrom .pb import generation_pb2, generation_pb2_grpc\n\n\nclass GenerationServer(generation_pb2_grpc.GenerationServiceServicer):\n    def __init__(self, model: Model) -> None:\n        self.model = model\n\n    def _unpack_proto_query_kwargs(self, query_kwargs):\n        query_kwargs = {k: getattr(v, v.WhichOneof(\"oneof_values\")) for k, v in query_kwargs.items()}\n        return query_kwargs\n\n    def Generate(self, request, context):\n        text = [r for r in request.texts]\n        generate_kwargs = self._unpack_proto_query_kwargs(request.generate_kwargs)\n\n        request = create_generate_request(text=text, generate_kwargs=generate_kwargs)\n\n        local_rank = int(os.getenv(\"LOCAL_RANK\", \"0\"))\n        torch.cuda.set_device(local_rank)\n        self.model.input_device = local_rank\n\n        response = self.model.generate(request)\n\n        if isinstance(response, Exception):\n            # if exception occurs, we don't want this subprocess to crash\n            response = generation_pb2.GenerationResponseProto(\n                error=str(response), is_encoder_decoder=response.is_encoder_decoder\n            )\n        else:\n            response = generation_pb2.GenerationResponseProto(\n                texts=response.text,\n                num_generated_tokens=response.num_generated_tokens,\n                is_encoder_decoder=response.is_encoder_decoder,\n            )\n\n        return response\n\n    def Forward(self, request, context):\n        conditioning_text = [r for r in request.conditioning_text]\n        response = [r for r in request.response]\n\n        request = ForwardRequest(conditioning_text=conditioning_text, response=response)\n\n        local_rank = 
int(os.getenv(\"LOCAL_RANK\", \"0\"))\n        torch.cuda.set_device(local_rank)\n        self.model.input_device = local_rank\n\n        response = self.model.forward(request)\n\n        if isinstance(response, Exception):\n            # if exception occurs, we don't want this subprocess to crash\n            response = generation_pb2.ForwardResponseProto(\n                error=str(response), is_encoder_decoder=response.is_encoder_decoder\n            )\n        else:\n            response = generation_pb2.ForwardResponseProto(\n                nll=response.nll, is_encoder_decoder=response.is_encoder_decoder\n            )\n\n        return response\n\n\ndef serve(inference_pipeline, port):\n    server = grpc.server(\n        futures.ThreadPoolExecutor(max_workers=1),\n        # options=[\n        #     (\"grpc.max_send_message_length\", GRPC_MAX_MSG_SIZE),\n        #     (\"grpc.max_receive_message_length\", GRPC_MAX_MSG_SIZE),\n        # ],\n    )\n    generation_pb2_grpc.add_GenerationServiceServicer_to_server(GenerationServer(inference_pipeline), server)\n    server.add_insecure_port(f\"[::]:{port}\")\n    print_rank_0(\"About to start server\")\n    server.start()\n    print_rank_0(\"Started\")\n    server.wait_for_termination()\n"
  },
  {
    "path": "inference_server/model_handler/grpc_utils/pb/__init__.py",
    "content": ""
  },
  {
    "path": "inference_server/model_handler/grpc_utils/pb/generation_pb2.py",
    "content": "# -*- coding: utf-8 -*-\n# Generated by the protocol buffer compiler.  DO NOT EDIT!\n# source: generation.proto\n\"\"\"Generated protocol buffer code.\"\"\"\nfrom google.protobuf import descriptor as _descriptor\nfrom google.protobuf import descriptor_pool as _descriptor_pool\nfrom google.protobuf import symbol_database as _symbol_database\nfrom google.protobuf.internal import builder as _builder\n\n\n# @@protoc_insertion_point(imports)\n\n_sym_db = _symbol_database.Default()\n\n\nDESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(\n    b'\\n\\x10generation.proto\\x12\\ngeneration\"_\\n\\x05Value\\x12\\x10\\n\\x06svalue\\x18\\x01 \\x01(\\tH\\x00\\x12\\x10\\n\\x06ivalue\\x18\\x02 \\x01(\\x03H\\x00\\x12\\x10\\n\\x06\\x66value\\x18\\x03 \\x01(\\x02H\\x00\\x12\\x10\\n\\x06\\x62value\\x18\\x04 \\x01(\\x08H\\x00\\x42\\x0e\\n\\x0coneof_values\"\\xc2\\x01\\n\\x16GenerationRequestProto\\x12\\r\\n\\x05texts\\x18\\x01 \\x03(\\t\\x12O\\n\\x0fgenerate_kwargs\\x18\\x02 \\x03(\\x0b\\x32\\x36.generation.GenerationRequestProto.GenerateKwargsEntry\\x1aH\\n\\x13GenerateKwargsEntry\\x12\\x0b\\n\\x03key\\x18\\x01 \\x01(\\t\\x12 \\n\\x05value\\x18\\x02 \\x01(\\x0b\\x32\\x11.generation.Value:\\x02\\x38\\x01\"q\\n\\x17GenerationResponseProto\\x12\\r\\n\\x05texts\\x18\\x01 \\x03(\\t\\x12\\x1c\\n\\x14num_generated_tokens\\x18\\x02 \\x03(\\x05\\x12\\r\\n\\x05\\x65rror\\x18\\x03 \\x01(\\t\\x12\\x1a\\n\\x12is_encoder_decoder\\x18\\x04 \\x01(\\x08\"B\\n\\x13\\x46orwardRequestProto\\x12\\x19\\n\\x11\\x63onditioning_text\\x18\\x01 \\x03(\\t\\x12\\x10\\n\\x08response\\x18\\x02 \\x03(\\t\"N\\n\\x14\\x46orwardResponseProto\\x12\\x0b\\n\\x03nll\\x18\\x01 \\x01(\\x02\\x12\\r\\n\\x05\\x65rror\\x18\\x02 \\x01(\\t\\x12\\x1a\\n\\x12is_encoder_decoder\\x18\\x03 
\\x01(\\x08\\x32\\xba\\x01\\n\\x11GenerationService\\x12U\\n\\x08Generate\\x12\".generation.GenerationRequestProto\\x1a#.generation.GenerationResponseProto\"\\x00\\x12N\\n\\x07\\x46orward\\x12\\x1f.generation.ForwardRequestProto\\x1a .generation.ForwardResponseProto\"\\x00\\x62\\x06proto3'\n)\n\n_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())\n_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, \"generation_pb2\", globals())\nif _descriptor._USE_C_DESCRIPTORS == False:\n    DESCRIPTOR._options = None\n    _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._options = None\n    _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_options = b\"8\\001\"\n    _VALUE._serialized_start = 32\n    _VALUE._serialized_end = 127\n    _GENERATIONREQUESTPROTO._serialized_start = 130\n    _GENERATIONREQUESTPROTO._serialized_end = 324\n    _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_start = 252\n    _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_end = 324\n    _GENERATIONRESPONSEPROTO._serialized_start = 326\n    _GENERATIONRESPONSEPROTO._serialized_end = 439\n    _FORWARDREQUESTPROTO._serialized_start = 441\n    _FORWARDREQUESTPROTO._serialized_end = 507\n    _FORWARDRESPONSEPROTO._serialized_start = 509\n    _FORWARDRESPONSEPROTO._serialized_end = 587\n    _GENERATIONSERVICE._serialized_start = 590\n    _GENERATIONSERVICE._serialized_end = 776\n# @@protoc_insertion_point(module_scope)\n"
  },
  {
    "path": "inference_server/model_handler/grpc_utils/pb/generation_pb2_grpc.py",
    "content": "# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!\n\"\"\"Client and server classes corresponding to protobuf-defined services.\"\"\"\nimport grpc\n\nfrom . import generation_pb2 as generation__pb2\n\n\nclass GenerationServiceStub(object):\n    \"\"\"Missing associated documentation comment in .proto file.\"\"\"\n\n    def __init__(self, channel):\n        \"\"\"Constructor.\n\n        Args:\n            channel: A grpc.Channel.\n        \"\"\"\n        self.Generate = channel.unary_unary(\n            \"/generation.GenerationService/Generate\",\n            request_serializer=generation__pb2.GenerationRequestProto.SerializeToString,\n            response_deserializer=generation__pb2.GenerationResponseProto.FromString,\n        )\n        self.Forward = channel.unary_unary(\n            \"/generation.GenerationService/Forward\",\n            request_serializer=generation__pb2.ForwardRequestProto.SerializeToString,\n            response_deserializer=generation__pb2.ForwardResponseProto.FromString,\n        )\n\n\nclass GenerationServiceServicer(object):\n    \"\"\"Missing associated documentation comment in .proto file.\"\"\"\n\n    def Generate(self, request, context):\n        \"\"\"Missing associated documentation comment in .proto file.\"\"\"\n        context.set_code(grpc.StatusCode.UNIMPLEMENTED)\n        context.set_details(\"Method not implemented!\")\n        raise NotImplementedError(\"Method not implemented!\")\n\n    def Forward(self, request, context):\n        \"\"\"Missing associated documentation comment in .proto file.\"\"\"\n        context.set_code(grpc.StatusCode.UNIMPLEMENTED)\n        context.set_details(\"Method not implemented!\")\n        raise NotImplementedError(\"Method not implemented!\")\n\n\ndef add_GenerationServiceServicer_to_server(servicer, server):\n    rpc_method_handlers = {\n        \"Generate\": grpc.unary_unary_rpc_method_handler(\n            servicer.Generate,\n            
request_deserializer=generation__pb2.GenerationRequestProto.FromString,\n            response_serializer=generation__pb2.GenerationResponseProto.SerializeToString,\n        ),\n        \"Forward\": grpc.unary_unary_rpc_method_handler(\n            servicer.Forward,\n            request_deserializer=generation__pb2.ForwardRequestProto.FromString,\n            response_serializer=generation__pb2.ForwardResponseProto.SerializeToString,\n        ),\n    }\n    generic_handler = grpc.method_handlers_generic_handler(\"generation.GenerationService\", rpc_method_handlers)\n    server.add_generic_rpc_handlers((generic_handler,))\n\n\n# This class is part of an EXPERIMENTAL API.\nclass GenerationService(object):\n    \"\"\"Missing associated documentation comment in .proto file.\"\"\"\n\n    @staticmethod\n    def Generate(\n        request,\n        target,\n        options=(),\n        channel_credentials=None,\n        call_credentials=None,\n        insecure=False,\n        compression=None,\n        wait_for_ready=None,\n        timeout=None,\n        metadata=None,\n    ):\n        return grpc.experimental.unary_unary(\n            request,\n            target,\n            \"/generation.GenerationService/Generate\",\n            generation__pb2.GenerationRequestProto.SerializeToString,\n            generation__pb2.GenerationResponseProto.FromString,\n            options,\n            channel_credentials,\n            insecure,\n            call_credentials,\n            compression,\n            wait_for_ready,\n            timeout,\n            metadata,\n        )\n\n    @staticmethod\n    def Forward(\n        request,\n        target,\n        options=(),\n        channel_credentials=None,\n        call_credentials=None,\n        insecure=False,\n        compression=None,\n        wait_for_ready=None,\n        timeout=None,\n        metadata=None,\n    ):\n        return grpc.experimental.unary_unary(\n            request,\n            target,\n            
\"/generation.GenerationService/Forward\",\n            generation__pb2.ForwardRequestProto.SerializeToString,\n            generation__pb2.ForwardResponseProto.FromString,\n            options,\n            channel_credentials,\n            insecure,\n            call_credentials,\n            compression,\n            wait_for_ready,\n            timeout,\n            metadata,\n        )\n"
  },
  {
    "path": "inference_server/model_handler/grpc_utils/proto/generation.proto",
    "content": "syntax = \"proto3\";\npackage generation;\n\nservice GenerationService {\n    rpc Generate (GenerationRequestProto) returns (GenerationResponseProto) {}\n    rpc Forward (ForwardRequestProto) returns (ForwardResponseProto) {}\n}\n\nmessage Value {\n    oneof oneof_values {\n        string svalue = 1;\n        int64 ivalue = 2;\n        float fvalue = 3;\n        bool bvalue = 4;\n    }\n}\n\nmessage GenerationRequestProto {\n    repeated string texts = 1;\n    map<string,Value> generate_kwargs = 2;\n}\n\nmessage GenerationResponseProto {\n    repeated string texts = 1;\n    repeated int32 num_generated_tokens = 2;\n    string error = 3;\n    bool is_encoder_decoder = 4;\n}\n\nmessage ForwardRequestProto {\n    repeated string conditioning_text = 1;\n    repeated string response = 2;\n}\n\nmessage ForwardResponseProto {\n    float nll = 1;\n    string error = 2;\n    bool is_encoder_decoder = 3;\n}\n"
  },
  {
    "path": "inference_server/model_handler/launch.py",
    "content": "\"\"\"\nCopyright 2022 The Microsoft DeepSpeed Team\n\"\"\"\nimport argparse\n\nimport torch.distributed as dist\n\nfrom ..models import get_model_class, start_inference_engine\nfrom ..utils import get_argument_parser, parse_args\nfrom .grpc_utils.generation_server import serve\n\n\ndef get_args() -> argparse.Namespace:\n    parser = get_argument_parser()\n\n    group = parser.add_argument_group(title=\"launch config\")\n    group.add_argument(\"--local_rank\", required=False, type=int, help=\"used by dist launchers\")\n    group.add_argument(\"--cpu_offload\", action=\"store_true\", help=\"whether to activate CPU offload for DS ZeRO\")\n    group.add_argument(\"--ports\", nargs=\"+\", help=\"GRPC ports\")\n\n    args = parse_args(parser)\n\n    return args\n\n\ndef main():\n    args = get_args()\n    start_inference_engine(args.deployment_framework)\n    model = get_model_class(args.deployment_framework)(args)\n    serve(model, args.ports[dist.get_rank()])\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "inference_server/models/__init__.py",
    "content": "from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE, HF_CPU\nfrom .model import Model, get_hf_model_class, load_tokenizer\n\n\ndef get_model_class(deployment_framework: str):\n    if deployment_framework == HF_ACCELERATE:\n        from .hf_accelerate import HFAccelerateModel\n\n        return HFAccelerateModel\n    elif deployment_framework == HF_CPU:\n        from .hf_cpu import HFCPUModel\n\n        return HFCPUModel\n    elif deployment_framework == DS_INFERENCE:\n        from .ds_inference import DSInferenceModel\n\n        return DSInferenceModel\n    elif deployment_framework == DS_ZERO:\n        from .ds_zero import DSZeROModel\n\n        return DSZeROModel\n    else:\n        raise ValueError(f\"Unknown deployment framework {deployment_framework}\")\n\n\ndef start_inference_engine(deployment_framework: str) -> None:\n    if deployment_framework in [DS_INFERENCE, DS_ZERO]:\n        import deepspeed\n\n        deepspeed.init_distributed(\"nccl\")\n"
  },
  {
    "path": "inference_server/models/ds_inference.py",
    "content": "import glob\nimport io\nimport json\nimport os\nfrom argparse import Namespace\nfrom functools import partial\n\nimport torch\n\nimport deepspeed\nfrom huggingface_hub import try_to_load_from_cache\nfrom transformers import AutoConfig\n\nfrom ..utils import get_world_size, run_rank_n\nfrom .model import Model, get_hf_model_class\n\n\n# basic DeepSpeed inference model class for benchmarking\nclass DSInferenceModel(Model):\n    def __init__(self, args: Namespace) -> None:\n        super().__init__(args)\n\n        # create dummy tensors for allocating space which will be filled with\n        # the actual weights while calling deepspeed.init_inference in the\n        # following code\n        with deepspeed.OnDevice(dtype=torch.float16, device=\"meta\"):\n            self.model = get_hf_model_class(args.model_class).from_config(\n                AutoConfig.from_pretrained(args.model_name), torch_dtype=torch.bfloat16\n            )\n        self.model = self.model.eval()\n\n        downloaded_model_path = get_model_path(args.model_name)\n\n        if args.dtype in [torch.float16, torch.int8]:\n            # We currently support the weights provided by microsoft (which are\n            # pre-sharded)\n            checkpoints_json = os.path.join(downloaded_model_path, \"ds_inference_config.json\")\n\n            if os.path.isfile(checkpoints_json):\n                self.model = deepspeed.init_inference(\n                    self.model,\n                    mp_size=get_world_size(),\n                    base_dir=downloaded_model_path,\n                    dtype=args.dtype,\n                    checkpoint=checkpoints_json,\n                    replace_with_kernel_inject=True,\n                )\n            else:\n                # for bigscience/bloom, sharding is done while loading the model\n                # so this is much slower and for this we need to create a\n                # checkpoints json\n                with 
TemporaryCheckpointsJSON(downloaded_model_path) as checkpoints_json:\n                    self.model = deepspeed.init_inference(\n                        self.model,\n                        mp_size=get_world_size(),\n                        base_dir=downloaded_model_path,\n                        dtype=args.dtype,\n                        checkpoint=checkpoints_json,\n                        replace_with_kernel_inject=True,\n                    )\n        elif args.dtype == torch.bfloat16:\n            # currently ds-inference only supports fp16 CUDA kernels :(\n            raise NotImplementedError(\"bfloat16 is not yet supported\")\n\n        self.model = self.model.module\n        self.input_device = torch.cuda.current_device()\n\n        self.post_init(args.model_name)\n\n\nclass TemporaryCheckpointsJSON:\n    def __init__(self, model_path: str):\n        self.tmp_directory = \"tmp\"\n        self.tmp_file = os.path.join(self.tmp_directory, \"checkpoints.json\")\n        self.model_path = model_path\n\n    def write_checkpoints_json(self) -> None:\n        print(self.model_path)\n        with io.open(self.tmp_file, \"w\", encoding=\"utf-8\") as f:\n            data = {\"type\": \"BLOOM\", \"checkpoints\": glob.glob(f\"{self.model_path}/*.bin\"), \"version\": 1.0}\n            json.dump(data, f)\n\n    def __enter__(self):\n        run_rank_n(os.makedirs, barrier=True)(self.tmp_directory, exist_ok=True)\n        run_rank_n(self.write_checkpoints_json, barrier=True)()\n        return self.tmp_file\n\n    def __exit__(self, type, value, traceback):\n        return\n\n\ndef get_model_path(model_name: str):\n    try:\n        config_file = \"config.json\"\n\n        # will fall back to HUGGINGFACE_HUB_CACHE\n        config_path = try_to_load_from_cache(model_name, config_file, cache_dir=os.getenv(\"TRANSFORMERS_CACHE\"))\n\n        if config_path is None:\n            # treat the model name as an explicit model path\n            return model_name\n        else:\n   
         return os.path.dirname(config_path)\n    except:\n        # treat the model name as an explicit model path\n        return model_name\n"
  },
  {
    "path": "inference_server/models/ds_zero.py",
    "content": "from argparse import Namespace\n\nimport torch\n\nimport deepspeed\nfrom transformers import AutoConfig\nfrom transformers.deepspeed import HfDeepSpeedConfig\n\nfrom ..utils import get_world_size\nfrom .model import Model, get_hf_model_class\n\n\nclass DSZeROModel(Model):\n    def __init__(self, args: Namespace) -> None:\n        super().__init__(args)\n\n        config = AutoConfig.from_pretrained(args.model_name)\n\n        train_micro_batch_size_per_gpu = 1\n        train_batch_size = train_micro_batch_size_per_gpu * get_world_size()\n\n        # try playing with these parameters, might improve throughput for you\n        # hardware setup\n        ds_config = {\n            \"fp16\": {\n                \"enabled\": args.dtype == torch.float16,\n            },\n            \"bf16\": {\n                \"enabled\": args.dtype == torch.bfloat16,\n            },\n            \"zero_optimization\": {\n                \"stage\": 3,\n                \"overlap_comm\": True,\n                \"contiguous_gradients\": True,\n                \"reduce_bucket_size\": config.hidden_size * config.hidden_size,\n                \"stage3_prefetch_bucket_size\": 0.9 * config.hidden_size * config.hidden_size,\n                \"stage3_param_persistence_threshold\": 0,\n            },\n            \"steps_per_print\": 2000,\n            \"train_batch_size\": train_batch_size,\n            \"train_micro_batch_size_per_gpu\": train_micro_batch_size_per_gpu,\n            \"wall_clock_breakdown\": False,\n        }\n\n        if args.cpu_offload:\n            ds_config[\"zero_optimization\"][\"offload_param\"] = {\"device\": \"cpu\", \"pin_memory\": True}\n\n        # this tells from_pretrained to instantiate directly on gpus\n        dschf = HfDeepSpeedConfig(ds_config)\n\n        self.model = get_hf_model_class(args.model_class).from_pretrained(args.model_name, torch_dtype=args.dtype)\n        self.model = self.model.eval()\n\n        # convert model to a fully sharded 
model using ZeRO\n        self.model = deepspeed.initialize(model=self.model, config_params=ds_config)[0]\n\n        self.model.module.eval()\n        self.model = self.model.module\n\n        # this is the CUDA device for the current process. This will be used\n        # later to identify the GPU on which to transfer tensors\n        self.input_device = torch.cuda.current_device()\n\n        self.post_init(args.model_name)\n"
  },
  {
    "path": "inference_server/models/hf_accelerate.py",
    "content": "from argparse import Namespace\n\nimport torch\n\nfrom ..utils import get_world_size\nfrom .model import Model, get_hf_model_class\n\n\nclass HFAccelerateModel(Model):\n    def __init__(self, args: Namespace) -> None:\n        super().__init__(args)\n\n        kwargs = {\"pretrained_model_name_or_path\": args.model_name, \"device_map\": \"auto\"}\n\n        if get_world_size() > 1:\n            kwargs[\"device_map\"] = \"balanced_low_0\"\n\n        if args.dtype == torch.int8:\n            # using LLM.int8()\n            kwargs[\"load_in_8bit\"] = True\n        else:\n            kwargs[\"torch_dtype\"] = args.dtype\n\n        # this is the CUDA device for the current process. This will be used\n        # later to identify the GPU on which to transfer tensors\n        self.model = get_hf_model_class(args.model_class).from_pretrained(**kwargs)\n\n        self.model.requires_grad_(False)\n        self.model.eval()\n        self.input_device = \"cuda:0\"\n\n        self.post_init(args.model_name)\n"
  },
  {
    "path": "inference_server/models/hf_cpu.py",
    "content": "from argparse import Namespace\n\nfrom .hf_accelerate import HFAccelerateModel\n\n\nclass HFCPUModel(HFAccelerateModel):\n    def __init__(self, args: Namespace) -> None:\n        super().__init__(args)\n        self.input_device = \"cpu\"\n"
  },
  {
    "path": "inference_server/models/model.py",
    "content": "import argparse\nimport copy\nfrom typing import List, Union\n\nimport torch\n\nimport transformers\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig\n\nfrom ..utils import (\n    ForwardRequest,\n    ForwardResponse,\n    GenerateRequest,\n    GenerateResponse,\n    TokenizeRequest,\n    TokenizeResponse,\n)\n\n\nclass Model:\n    def __init__(self, args: argparse.Namespace) -> None:\n        self.model = None\n        self.input_device = None\n        self.max_input_length = args.max_input_length\n        self.max_batch_size = args.max_batch_size\n\n    def post_init(self, model_name: str) -> None:\n        self.is_encoder_decoder = AutoConfig.from_pretrained(model_name).is_encoder_decoder\n        self.generation_config = GenerationConfig.from_model_config(AutoConfig.from_pretrained(model_name))\n        self.tokenizer = load_tokenizer(model_name)\n        self.pad = self.tokenizer.pad_token_id\n        self.prefix_token_id = self.tokenizer(\"A\")[\"input_ids\"][0]\n\n    def get_generation_config(self, request: GenerateRequest) -> GenerationConfig:\n        generation_config = copy.deepcopy(self.generation_config)\n        request = dict(request)\n\n        request_filtered = {}\n        for key, value in request.items():\n            if value is not None and key not in [\"text\", \"remove_input_from_output\"]:\n                request_filtered[key] = value\n        request_filtered[\"return_dict_in_generate\"] = True\n\n        generation_config.update(**request_filtered)\n        return generation_config\n\n    def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exception]:\n        try:\n            batch_size = len(request.text)\n\n            check_batch_size(batch_size, self.max_batch_size)\n\n            input_tokens = self.tokenizer(request.text, return_tensors=\"pt\", padding=True)\n            max_input_length_in_batch = 
input_tokens.input_ids[0].shape[0]\n\n            check_max_input_length(max_input_length_in_batch, self.max_input_length)\n\n            for t in input_tokens:\n                if torch.is_tensor(input_tokens[t]):\n                    input_tokens[t] = input_tokens[t].to(self.input_device)\n\n            num_input_tokens = input_tokens[\"input_ids\"].shape[1]\n\n            generation_config = self.get_generation_config(request)\n\n            output = self.model.generate(**input_tokens, generation_config=generation_config)\n\n            output_tokens = output.sequences\n\n            if self.is_encoder_decoder:\n                num_generated_tokens = (output_tokens != self.pad).sum(dim=-1).tolist()\n                generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True)\n            else:\n                generated_tokens = output_tokens[:, num_input_tokens:]\n                num_generated_tokens = (generated_tokens != self.pad).sum(dim=-1).tolist()\n\n                if request.remove_input_from_output:\n                    # create the dummy prefix for detokenization\n                    prefix_to_add = torch.tensor([[self.prefix_token_id]] * batch_size).to(self.input_device)\n                    # the generate method's output includes input too. 
Remove input if\n                    # that is requested by the user\n                    generated_tokens = torch.cat([prefix_to_add, generated_tokens], dim=1)\n                    generated_text = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n                    generated_text = [i[1:] for i in generated_text]\n                else:\n                    generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True)\n\n            return GenerateResponse(\n                text=generated_text,\n                num_generated_tokens=num_generated_tokens,\n                is_encoder_decoder=self.is_encoder_decoder,\n            )\n        except Exception as exception:\n            return exception\n\n    def forward(self, request: ForwardRequest) -> Union[ForwardResponse, Exception]:\n        def prepare_tensors(conditioning_tokens: List[List[int]], response_tokens: List[List[int]]):\n            bs = len(conditioning_tokens)\n\n            input_ids = [conditioning_tokens[i] + response_tokens[i] for i in range(bs)]\n            attention_mask = [[1] * (len(conditioning_tokens[i]) + len(response_tokens[i])) for i in range(bs)]\n            labels = [[-100] * len(conditioning_tokens[i]) + response_tokens[i] for i in range(bs)]\n\n            input_ids = pad(input_ids, self.tokenizer.pad_token_id)\n            attention_mask = pad(attention_mask, 0)\n            labels = pad(labels, -100)\n\n            return {\n                \"input_ids\": torch.tensor(input_ids),\n                \"attention_mask\": torch.tensor(attention_mask),\n                \"labels\": torch.tensor(labels),\n            }\n\n        def pad(arrays: list, padding: int, max_length: int = None):\n            if max_length is None:\n                max_length = max(list(map(len, arrays)))\n\n            arrays = [[padding] * (max_length - len(array)) + array for array in arrays]\n            return arrays\n\n        try:\n            batch_size = 
len(request.conditioning_text)\n\n            check_batch_size(batch_size, self.max_batch_size)\n\n            conditioning_tokens = self.tokenizer(request.conditioning_text)[\"input_ids\"]\n            response_tokens = self.tokenizer(request.response)[\"input_ids\"]\n\n            max_length_in_batch = max([len(conditioning_tokens) + len(response_tokens)])\n            check_max_input_length(max_length_in_batch, self.max_input_length)\n\n            input_tokens = prepare_tensors(conditioning_tokens, response_tokens)\n\n            for t in input_tokens:\n                if torch.is_tensor(input_tokens[t]):\n                    input_tokens[t] = input_tokens[t].to(self.input_device)\n\n            loss = self.model(**input_tokens).loss\n\n            return ForwardResponse(nll=loss.item(), is_encoder_decoder=self.is_encoder_decoder)\n        except Exception as exception:\n            return exception\n\n    def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:\n        return TokenizeResponse(\n            token_ids=self.tokenizer(request.text).input_ids,\n            is_encoder_decoder=self.is_encoder_decoder,\n        )\n\n\ndef check_max_input_length(input_token_length: int, max_input_length: int) -> None:\n    if max_input_length is None:\n        return\n\n    if input_token_length > max_input_length:\n        raise Exception(f\"max supported input length = {max_input_length} for now\")\n\n\ndef check_batch_size(batch_size: int, max_batch_size: int) -> None:\n    if max_batch_size is None:\n        return\n\n    if batch_size > max_batch_size:\n        raise Exception(f\"max supported batch size = {max_batch_size} for now\")\n\n\n# this is a hack for now\ndef get_hf_model_class(model_class: str) -> Union[AutoModelForCausalLM, AutoModelForSeq2SeqLM]:\n    return getattr(transformers, model_class)\n\n\ndef load_tokenizer(model_name: str) -> AutoTokenizer:\n    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side=\"left\")\n\n    if 
tokenizer.pad_token_id is None:\n        tokenizer.add_special_tokens({\"pad_token\": \"[PAD]\"})\n\n    return tokenizer\n"
  },
  {
    "path": "inference_server/server.py",
    "content": "import os\nfrom functools import partial\n\nfrom flask import Flask, request\nfrom flask_api import status\nfrom pydantic import BaseModel\n\nfrom .constants import HF_ACCELERATE\nfrom .model_handler.deployment import ModelDeployment\nfrom .utils import (\n    ForwardRequest,\n    GenerateRequest,\n    TokenizeRequest,\n    get_exception_response,\n    get_num_tokens_to_generate,\n    get_torch_dtype,\n    parse_bool,\n    run_and_log_time,\n)\n\n\nclass QueryID(BaseModel):\n    generate_query_id: int = 0\n    tokenize_query_id: int = 0\n    forward_query_id: int = 0\n\n\n# placeholder class for getting args. gunicorn does not allow passing args to a\n# python script via ArgumentParser\nclass Args:\n    def __init__(self) -> None:\n        self.deployment_framework = os.getenv(\"DEPLOYMENT_FRAMEWORK\", HF_ACCELERATE)\n        self.model_name = os.getenv(\"MODEL_NAME\")\n        self.model_class = os.getenv(\"MODEL_CLASS\")\n        self.dtype = get_torch_dtype(os.getenv(\"DTYPE\"))\n        self.allowed_max_new_tokens = int(os.getenv(\"ALLOWED_MAX_NEW_TOKENS\", 100))\n        self.max_input_length = int(os.getenv(\"MAX_INPUT_LENGTH\", 512))\n        self.max_batch_size = int(os.getenv(\"MAX_BATCH_SIZE\", 4))\n        self.debug = parse_bool(os.getenv(\"DEBUG\", \"false\"))\n\n\n# ------------------------------------------------------\nargs = Args()\nmodel = ModelDeployment(args, True)\nquery_ids = QueryID()\napp = Flask(__name__)\n# ------------------------------------------------------\n\n\n@app.route(\"/query_id/\", methods=[\"GET\"])\ndef query_id():\n    return query_ids.dict(), status.HTTP_200_OK\n\n\n@app.route(\"/tokenize/\", methods=[\"POST\"])\ndef tokenize():\n    try:\n        x = request.get_json()\n        x = TokenizeRequest(**x)\n\n        response, total_time_taken = run_and_log_time(partial(model.tokenize, request=x))\n\n        response.query_id = query_ids.tokenize_query_id\n        query_ids.tokenize_query_id += 1\n        
response.total_time_taken = \"{:.2f} msecs\".format(total_time_taken * 1000)\n\n        return response.dict(), status.HTTP_200_OK\n    except Exception:\n        response = get_exception_response(query_ids.tokenize_query_id, args.debug)\n        query_ids.tokenize_query_id += 1\n        return response, status.HTTP_500_INTERNAL_SERVER_ERROR\n\n\n@app.route(\"/generate/\", methods=[\"POST\"])\ndef generate():\n    try:\n        x = request.get_json()\n        x = GenerateRequest(**x)\n\n        x.max_new_tokens = get_num_tokens_to_generate(x.max_new_tokens, args.allowed_max_new_tokens)\n\n        response, total_time_taken = run_and_log_time(partial(model.generate, request=x))\n\n        response.query_id = query_ids.generate_query_id\n        query_ids.generate_query_id += 1\n        response.total_time_taken = \"{:.2f} secs\".format(total_time_taken)\n\n        return response.dict(), status.HTTP_200_OK\n    except Exception:\n        response = get_exception_response(query_ids.generate_query_id, args.debug)\n        query_ids.generate_query_id += 1\n        return response, status.HTTP_500_INTERNAL_SERVER_ERROR\n\n\n@app.route(\"/forward/\", methods=[\"POST\"])\ndef forward():\n    try:\n        x = request.get_json()\n        x = ForwardRequest(**x)\n\n        if len(x.conditioning_text) != len(x.response):\n            raise Exception(\"unequal number of elements in conditioning_text and response arguments\")\n\n        response, total_time_taken = run_and_log_time(partial(model.forward, request=x))\n\n        response.query_id = query_ids.forward_query_id\n        query_ids.forward_query_id += 1\n        response.total_time_taken = \"{:.2f} secs\".format(total_time_taken)\n\n        return response.dict(), status.HTTP_200_OK\n    except Exception:\n        response = get_exception_response(query_ids.forward_query_id, args.debug)\n        query_ids.forward_query_id += 1\n        return response, status.HTTP_500_INTERNAL_SERVER_ERROR\n"
  },
  {
    "path": "inference_server/utils/__init__.py",
    "content": "from .requests import (\n    ForwardRequest,\n    ForwardResponse,\n    GenerateRequest,\n    GenerateResponse,\n    TokenizeRequest,\n    TokenizeResponse,\n    create_generate_request,\n    get_filter_dict,\n    parse_bool,\n)\nfrom .utils import (\n    get_argument_parser,\n    get_cuda_visible_devices,\n    get_dummy_batch,\n    get_exception_response,\n    get_num_tokens_to_generate,\n    get_str_dtype,\n    get_torch_dtype,\n    get_world_size,\n    pad_ids,\n    parse_args,\n    print_rank_0,\n    run_and_log_time,\n    run_rank_n,\n)\n"
  },
  {
    "path": "inference_server/utils/requests.py",
    "content": "from typing import Any, List\n\nfrom pydantic import BaseModel\n\n\nclass BaseResponse(BaseModel):\n    query_id: int = None\n    total_time_taken: str = None\n\n\nclass GenerateRequest(BaseModel):\n    text: List[str] = None\n    min_length: int = None\n    do_sample: bool = None\n    early_stopping: bool = None\n    temperature: float = None\n    top_k: int = None\n    top_p: float = None\n    typical_p: float = None\n    repetition_penalty: float = None\n    bos_token_id: int = None\n    pad_token_id: int = None\n    eos_token_id: int = None\n    length_penalty: float = None\n    no_repeat_ngram_size: int = None\n    encoder_no_repeat_ngram_size: int = None\n    max_time: float = None\n    max_new_tokens: int = None\n    decoder_start_token_id: int = None\n    diversity_penalty: float = None\n    forced_bos_token_id: int = None\n    forced_eos_token_id: int = None\n    exponential_decay_length_penalty: float = None\n    remove_input_from_output: bool = True\n\n    def get_generate_kwargs(self) -> dict:\n        x = {}\n        for k, v in self.dict().items():\n            if k not in [\"text\", \"method\"] and v is not None:\n                x[k] = v\n        return x\n\n\nclass GenerateResponse(BaseResponse):\n    text: List[str] = None\n    num_generated_tokens: List[int] = None\n    is_encoder_decoder: bool = False\n\n\nclass TokenizeRequest(BaseModel):\n    text: List[str] = None\n\n\nclass TokenizeResponse(BaseResponse):\n    token_ids: List[List[int]] = None\n    is_encoder_decoder: bool = False\n\n\nclass ForwardRequest(BaseModel):\n    conditioning_text: List[str] = None\n    response: List[str] = None\n\n\nclass ForwardResponse(BaseResponse):\n    nll: float = None\n    is_encoder_decoder: bool = False\n\n\ndef parse_bool(value: str) -> bool:\n    if value.lower() == \"true\":\n        return True\n    elif value.lower() == \"false\":\n        return False\n    else:\n        raise ValueError(\"{} is not a valid boolean 
value\".format(value))\n\n\ndef parse_field(kwargs: dict, field: str, dtype: type, default_value: Any = None) -> Any:\n    if field in kwargs:\n        if type(kwargs[field]) == dtype:\n            return kwargs[field]\n        elif dtype == bool:\n            return parse_bool(kwargs[field])\n        else:\n            return dtype(kwargs[field])\n    else:\n        return default_value\n\n\ndef create_generate_request(text: List[str], generate_kwargs: dict) -> GenerateRequest:\n    # get user generate_kwargs as json and parse it\n    return GenerateRequest(\n        text=text,\n        min_length=parse_field(generate_kwargs, \"min_length\", int),\n        do_sample=parse_field(generate_kwargs, \"do_sample\", bool),\n        early_stopping=parse_field(generate_kwargs, \"early_stopping\", bool),\n        temperature=parse_field(generate_kwargs, \"temperature\", float),\n        top_k=parse_field(generate_kwargs, \"top_k\", int),\n        top_p=parse_field(generate_kwargs, \"top_p\", float),\n        typical_p=parse_field(generate_kwargs, \"typical_p\", float),\n        repetition_penalty=parse_field(generate_kwargs, \"repetition_penalty\", float),\n        bos_token_id=parse_field(generate_kwargs, \"bos_token_id\", int),\n        pad_token_id=parse_field(generate_kwargs, \"pad_token_id\", int),\n        eos_token_id=parse_field(generate_kwargs, \"eos_token_id\", int),\n        length_penalty=parse_field(generate_kwargs, \"length_penalty\", float),\n        no_repeat_ngram_size=parse_field(generate_kwargs, \"no_repeat_ngram_size\", int),\n        encoder_no_repeat_ngram_size=parse_field(generate_kwargs, \"encoder_no_repeat_ngram_size\", int),\n        max_time=parse_field(generate_kwargs, \"max_time\", float),\n        max_new_tokens=parse_field(generate_kwargs, \"max_new_tokens\", int),\n        decoder_start_token_id=parse_field(generate_kwargs, \"decoder_start_token_id\", int),\n        diversity_penalty=parse_field(generate_kwargs, \"diversity_penalty\", 
float),\n        forced_bos_token_id=parse_field(generate_kwargs, \"forced_bos_token_id\", int),\n        forced_eos_token_id=parse_field(generate_kwargs, \"forced_eos_token_id\", int),\n        exponential_decay_length_penalty=parse_field(generate_kwargs, \"exponential_decay_length_penalty\", float),\n        remove_input_from_output=parse_field(generate_kwargs, \"remove_input_from_output\", bool, True),\n    )\n\n\ndef get_filter_dict(d: BaseModel) -> dict:\n    d = dict(d)\n    q = {}\n    for i in d:\n        if d[i] != None:\n            q[i] = d[i]\n    del q[\"text\"]\n    return q\n"
  },
  {
    "path": "inference_server/utils/utils.py",
    "content": "import argparse\nimport copy\nimport json\nimport math\nimport os\nimport sys\nimport time\nimport traceback\nfrom functools import partial\nfrom typing import Any, Callable, List, Tuple, Union\n\nimport torch\nimport torch.distributed as dist\n\nfrom ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE, HF_CPU\n\n\n# used for benchmarks\ndummy_input_sentences = [\n    \"DeepSpeed is a machine learning framework\",\n    \"He is working on\",\n    \"He has a\",\n    \"He got all\",\n    \"Everyone is happy and I can\",\n    \"The new movie that got Oscar this year\",\n    \"In the far far distance from our galaxy,\",\n    \"Peace is the only way\",\n]\n\n\ndef get_argument_parser() -> argparse.ArgumentParser:\n    parser = argparse.ArgumentParser()\n\n    group = parser.add_argument_group(title=\"model\")\n    group.add_argument(\n        \"--deployment_framework\",\n        type=str,\n        choices=[HF_ACCELERATE, DS_INFERENCE, DS_ZERO, HF_CPU],\n        default=HF_ACCELERATE,\n    )\n    group.add_argument(\n        \"--model_name\",\n        type=str,\n        required=True,\n        help=\"model name to use\",\n    )\n    group.add_argument(\n        \"--model_class\",\n        type=str,\n        required=True,\n        help=\"model class to use\",\n    )\n    group.add_argument(\n        \"--dtype\", type=str, required=True, choices=[\"bf16\", \"fp16\", \"int8\", \"fp32\"], help=\"dtype for model\"\n    )\n    group.add_argument(\n        \"--generate_kwargs\",\n        type=str,\n        default='{\"min_length\": 100, \"max_new_tokens\": 100, \"do_sample\": false}',\n        help=\"generate parameters. 
look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters\",\n    )\n    group.add_argument(\"--max_input_length\", type=int, help=\"max input length\")\n    group.add_argument(\"--max_batch_size\", type=int, help=\"max supported batch size\")\n\n    return parser\n\n\ndef parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace:\n    args = parser.parse_args()\n\n    args.dtype = get_torch_dtype(args.dtype)\n    args.generate_kwargs = json.loads(args.generate_kwargs)\n\n    return args\n\n\ndef run_rank_n(func: Callable, rank: int = 0, barrier: bool = False) -> None:\n    # wrapper function for the rank to execute on\n    def func_rank_n(*args, **kwargs):\n        output = func(*args, **kwargs)\n        if barrier:\n            dist.barrier()\n        return output\n\n    # a dummy method that doesn't do anything\n    def func_rank_other(*args, **kwargs):\n        if barrier:\n            dist.barrier()\n\n    if dist.is_initialized():\n        if dist.get_rank() == rank:\n            return func_rank_n\n        return func_rank_other\n    else:\n        return func\n\n\n@run_rank_n\ndef print_rank_0(*args, **kwargs) -> None:\n    print(*args, **kwargs)\n\n\ndef get_torch_dtype(dtype_str: str) -> torch.dtype:\n    if dtype_str == \"bf16\":\n        return torch.bfloat16\n    elif dtype_str == \"fp16\":\n        return torch.float16\n    elif dtype_str == \"int8\":\n        return torch.int8\n    elif dtype_str == \"fp32\":\n        return torch.float32\n\n\ndef get_str_dtype(dtype_str: torch.dtype) -> str:\n    if dtype_str == torch.bfloat16:\n        return \"bf16\"\n    elif dtype_str == torch.float16:\n        return \"fp16\"\n    elif dtype_str == torch.int8:\n        return \"int8\"\n    elif dtype_str == torch.float32:\n        return \"fp32\"\n\n\ndef get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> 
List[str]:\n    if input_sentences is None:\n        input_sentences = copy.deepcopy(dummy_input_sentences)\n\n    if batch_size > len(input_sentences):\n        input_sentences *= math.ceil(batch_size / len(input_sentences))\n    input_sentences = input_sentences[:batch_size]\n\n    return input_sentences\n\n\ndef get_num_tokens_to_generate(max_new_tokens: int, allowed_max_new_tokens: int) -> int:\n    if max_new_tokens is None:\n        return allowed_max_new_tokens\n    else:\n        return min(max_new_tokens, allowed_max_new_tokens)\n\n\ndef run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]:\n    # runs a function / list of functions and times them\n    start_time = time.time()\n\n    if type(execs) == list:\n        results = []\n        for f in execs:\n            results.append(f())\n    else:\n        results = execs()\n\n    time_elapsed = time.time() - start_time\n    return results, time_elapsed\n\n\ndef pad_ids(arrays, padding, max_length=-1):\n    # does left padding\n    if max_length < 0:\n        max_length = max(list(map(len, arrays)))\n\n    arrays = [[padding] * (max_length - len(array)) + array for array in arrays]\n\n    return arrays\n\n\ndef get_exception_response(query_id: int, debug: bool = False):\n    e_type, e_message, e_stack_trace = sys.exc_info()\n    response = {\"error\": str(e_type.__name__), \"message\": str(e_message), \"query_id\": query_id}\n\n    if debug:\n        trace_back = traceback.extract_tb(e_stack_trace)\n\n        # Format stacktrace\n        stack_trace = []\n        for trace in trace_back:\n            stack_trace.append(\n                \"File : {}, Line : {}, Func.Name : {}, Message : {}\".format(trace[0], trace[1], trace[2], trace[3])\n            )\n\n        response[\"stack_trace\"] = stack_trace\n\n    return response\n\n\ndef get_world_size() -> int:\n    if dist.is_initialized():\n        return dist.get_world_size()\n    else:\n        cuda_visible_devices = 
get_cuda_visible_devices()\n        if cuda_visible_devices is None:\n            return 0\n        return len(cuda_visible_devices)\n\n\ndef get_cuda_visible_devices() -> List[int]:\n    cuda_visible_devices = os.getenv(\"CUDA_VISIBLE_DEVICES\")\n    if cuda_visible_devices is not None:\n        cuda_visible_devices = list(map(int, cuda_visible_devices.split(\",\")))\n    return cuda_visible_devices\n"
  },
  {
    "path": "server_request.py",
    "content": "import argparse\n\nimport requests\n\n\ndef get_args() -> argparse.Namespace:\n    parser = argparse.ArgumentParser()\n\n    group = parser.add_argument_group(title=\"launch config\")\n    group.add_argument(\"--host\", type=str, required=True, help=\"host address\")\n    group.add_argument(\"--port\", type=int, required=True, help=\"port number\")\n\n    return parser.parse_args()\n\n\ndef generate(url: str) -> None:\n    url = url + \"/generate/\"\n\n    request_body = {\n        \"text\": [\n            \"DeepSpeed\",\n            \"DeepSpeed is a\",\n            \"DeepSpeed is a machine\",\n            \"DeepSpeed is a machine learning framework\",\n        ],\n        \"max_new_tokens\": 40,\n    }\n    response = requests.post(url=url, json=request_body, verify=False)\n    print(response.json(), \"\\n\")\n\n\ndef tokenize(url: str) -> None:\n    url = url + \"/tokenize/\"\n\n    request_body = {\"text\": [\"DeepSpeed is a\", \"DeepSpeed is a machine learning framework\"]}\n    response = requests.post(url=url, json=request_body, verify=False)\n    print(response.json(), \"\\n\")\n\n\ndef forward(url: str) -> None:\n    url = url + \"/forward/\"\n\n    request_body = {\n        \"conditioning_text\": [\n            \"DeepSpeed\",\n            \"DeepSpeed is a\",\n            \"DeepSpeed is a machine\",\n            \"DeepSpeed is a machine learning framework\",\n        ],\n        \"response\": [\n            \"DeepSpeed\",\n            \"DeepSpeed is a\",\n            \"DeepSpeed is a machine\",\n            \"DeepSpeed is a machine learning framework\",\n        ],\n    }\n    response = requests.post(url=url, json=request_body, verify=False)\n    print(response.json(), \"\\n\")\n\n\ndef query_id(url: str) -> None:\n    url = url + \"/query_id/\"\n\n    response = requests.get(url=url, verify=False)\n    print(response.json(), \"\\n\")\n\n\ndef main():\n    args = get_args()\n    url = \"http://{}:{}\".format(args.host, args.port)\n\n    
generate(url)\n    tokenize(url)\n    forward(url)\n    query_id(url)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "setup.cfg",
    "content": "[isort]\ndefault_section = FIRSTPARTY\nensure_newline_before_comments = True\nforce_grid_wrap = 0\ninclude_trailing_comma = True\nknown_first_party = transformers\nknown_third_party =\n    absl\n    conllu\n    datasets\n    elasticsearch\n    fairseq\n    faiss-cpu\n    fastprogress\n    fire\n    fugashi\n    git\n    h5py\n    matplotlib\n    nltk\n    numpy\n    packaging\n    pandas\n    PIL\n    psutil\n    pytest\n    pytorch_lightning\n    rouge_score\n    sacrebleu\n    seqeval\n    sklearn\n    streamlit\n    tensorboardX\n    tensorflow\n    tensorflow_datasets\n    timeout_decorator\n    torch\n    torchaudio\n    torchtext\n    torchvision\n    torch_xla\n    tqdm\n\nline_length = 119\nlines_after_imports = 2\nmulti_line_output = 3\nuse_parentheses = True\n"
  },
  {
    "path": "static/css/style.css",
    "content": "#left-column {\n    width: 80%;\n}\n\n#right-column {\n    width: 18%;\n    float: right;\n    padding-right: 10px;\n}\n\nbody {\n    background-color: lightgray;\n    height: auto;\n}\n\n#text-input {\n    width: 100%;\n    float: left;\n    resize: none;\n}\n\n.slider {\n    width: 100%;\n    float: left;\n}\n\n#log-output {\n    width: 100%;\n    float: left;\n    resize: none;\n}\n\n#max-new-tokens-input {\n    width: 30%;\n    float: left;\n    margin-left: 5px;\n}\n"
  },
  {
    "path": "static/js/index.js",
    "content": "const textGenInput = document.getElementById('text-input');\nconst clickButton = document.getElementById('submit-button');\n\nconst temperatureSlider = document.getElementById('temperature-slider');\nconst temperatureTextBox = document.getElementById('temperature-textbox')\n\nconst top_pSlider = document.getElementById('top_p-slider');\nconst top_pTextBox = document.getElementById('top_p-textbox');\n\nconst top_kSlider = document.getElementById('top_k-slider');\nconst top_kTextBox = document.getElementById('top_k-textbox');\n\nconst repetition_penaltySlider = document.getElementById('repetition_penalty-slider');\nconst repetition_penaltyTextBox = document.getElementById('repetition_penalty-textbox');\n\nconst max_new_tokensInput = document.getElementById('max-new-tokens-input');\n\nconst textLogOutput = document.getElementById('log-output');\n\nfunction get_temperature() {\n    return parseFloat(temperatureSlider.value);\n}\n\ntemperatureSlider.addEventListener('input', async (event) => {\n    temperatureTextBox.innerHTML = \"temperature = \" + get_temperature();\n});\n\nfunction get_top_p() {\n    return parseFloat(top_pSlider.value);\n}\n\ntop_pSlider.addEventListener('input', async (event) => {\n    top_pTextBox.innerHTML = \"top_p = \" + get_top_p();\n});\n\nfunction get_top_k() {\n    return parseInt(top_kSlider.value);\n}\n\ntop_kSlider.addEventListener('input', async (event) => {\n    top_kTextBox.innerHTML = \"top_k = \" + get_top_k();\n});\n\nfunction get_repetition_penalty() {\n    return parseFloat(repetition_penaltySlider.value);\n}\n\nrepetition_penaltySlider.addEventListener('input', async (event) => {\n    repetition_penaltyTextBox.innerHTML = \"repetition_penalty = \" + get_repetition_penalty();\n});\n\nfunction get_max_new_tokens() {\n    return parseInt(max_new_tokensInput.value);\n}\n\nclickButton.addEventListener('click', async (event) => {\n    clickButton.textContent = 'Processing'\n    clickButton.disabled = true;\n\n    var 
jsonPayload = {\n        text: [textGenInput.value],\n        temperature: get_temperature(),\n        top_k: get_top_k(),\n        top_p: get_top_p(),\n        max_new_tokens: get_max_new_tokens(),\n        repetition_penalty: get_repetition_penalty(),\n        do_sample: true,\n        remove_input_from_output: true\n    };\n\n    if (jsonPayload.temperature == 0) {\n        jsonPayload.do_sample = false;\n    }\n\n    console.log(jsonPayload);\n\n    $.ajax({\n        url: '/generate/',\n        type: 'POST',\n        contentType: \"application/json; charset=utf-8\",\n        data: JSON.stringify(jsonPayload),\n        headers: { 'Access-Control-Allow-Origin': '*' },\n        success: function (response) {\n            var input_text = textGenInput.value;\n\n            if (\"text\" in response) {\n                if (response.is_encoder_decoder) {\n                    textLogOutput.value = response.text[0] + '\\n\\n';\n                } else {\n                    textGenInput.value = input_text + response.text[0];\n                    textLogOutput.value = '';\n                }\n\n                textLogOutput.value += 'total_time_taken = ' + response.total_time_taken + \"\\n\";\n                textLogOutput.value += 'num_generated_tokens = ' + response.num_generated_tokens + \"\\n\";\n                textLogOutput.style.backgroundColor = \"lightblue\";\n            } else {\n                textLogOutput.value = 'total_time_taken = ' + response.total_time_taken + \"\\n\";\n                textLogOutput.value += 'error: ' + response.message;\n                textLogOutput.style.backgroundColor = \"#D65235\";\n            }\n\n            clickButton.textContent = 'Submit';\n            clickButton.disabled = false;\n        },\n        error: function (error) {\n            console.log(JSON.stringify(error, null, 2));\n            clickButton.textContent = 'Submit'\n            clickButton.disabled = false;\n        }\n    });\n});\n"
  },
  {
    "path": "templates/index.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n\n<head>\n    <meta charset=\"UTF-8\" />\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n    <title>Large Models Playground</title>\n    <link href=\"{{ url_for('static', path='css/style.css') }}\" rel=\"stylesheet\">\n    <script type=\"module\" src=\"{{ url_for('static', path='js/index.js') }}\"></script>\n    <script src=\"https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js\"></script>\n</head>\n\n<body>\n    <div id=\"left-column\">\n        <textarea placeholder=\"Input Prompt\" id=\"text-input\" style=\"color: black; background-color: white;\"\n            rows=\"47\"></textarea>\n    </div>\n\n    <div id=\"right-column\">\n        <div>\n            <textbox id=\"temperature-textbox\">\n                temperature = 1\n            </textbox>\n            <input type=\"range\" min=\"0\" max=\"1\" value=\"1\" step=\"0.01\" class=\"slider\" id=\"temperature-slider\">\n        </div>\n\n        <div>\n            <textbox id=\"top_k-textbox\">\n                top_k = 50\n            </textbox>\n            <input type=\"range\" min=\"1\" max=\"100\" value=\"50\" class=\"slider\" id=\"top_k-slider\">\n        </div>\n\n        <div>\n            <textbox id=\"top_p-textbox\">\n                top_p = 1\n            </textbox>\n            <input type=\"range\" min=\"0\" max=\"1\" step=\"0.01\" value=\"1\" class=\"slider\" id=\"top_p-slider\">\n        </div>\n\n        <div>\n            <textbox style=\"float: left;\">\n                max_new_tokens =\n            </textbox>\n            <input type=\"text\" value=\"40\" id=\"max-new-tokens-input\">\n        </div>\n\n        <div>\n            <textbox id=\"repetition_penalty-textbox\">\n                repetition_penalty = 1\n            </textbox>\n            <input type=\"range\" min=\"1\" max=\"3\" step=\"0.01\" value=\"1\" class=\"slider\" id=\"repetition_penalty-slider\">\n        </div>\n\n        
<button id=\"submit-button\" style=\"margin-top: 10px;\">Submit</button>\n\n        <div style=\"margin-top: 10px;\">\n            <textarea id=\"log-output\" rows=\"40\" style=\"color: black; background-color: lightblue;\" readonly></textarea>\n        </div>\n    </div>\n</body>\n\n</html>\n"
  },
  {
    "path": "ui.py",
    "content": "import argparse\n\nimport requests\nfrom fastapi import FastAPI, Request\nfrom fastapi.middleware.cors import CORSMiddleware\nfrom fastapi.responses import HTMLResponse, JSONResponse\nfrom fastapi.routing import APIRoute, Mount\nfrom fastapi.staticfiles import StaticFiles\nfrom fastapi.templating import Jinja2Templates\nfrom transformers import AutoTokenizer\nfrom uvicorn import run\n\n\ndef get_args() -> argparse.Namespace:\n    parser = argparse.ArgumentParser()\n\n    group = parser.add_argument_group(title=\"launch config\")\n    group.add_argument(\"--ui_host\", type=str, default=\"127.0.0.1\", help=\"host address for UI\")\n    group.add_argument(\"--ui_port\", type=int, default=5001, help=\"port number for UI\")\n    group.add_argument(\n        \"--generation_backend_host\", type=str, default=\"127.0.0.1\", help=\"host address for generation server\"\n    )\n    group.add_argument(\"--generation_backend_port\", type=int, default=5000, help=\"port number for generation server\")\n\n    return parser.parse_args()\n\n\nclass Server:\n    def __init__(self, args: argparse.Namespace):\n        self.templates = Jinja2Templates(directory=\"templates\")\n        self.ui_host = args.ui_host\n        self.ui_port = args.ui_port\n        self.generation_backend_host = args.generation_backend_host\n        self.generation_backend_port = args.generation_backend_port\n        self.workers = 1\n\n        self.tokenizer = AutoTokenizer.from_pretrained(\"bigscience/bloom\")\n\n        self.app = FastAPI(\n            routes=[\n                APIRoute(\"/\", self.homepage, methods=[\"GET\"], response_class=HTMLResponse),\n                APIRoute(\"/generate/\", self.generate, methods=[\"POST\"]),\n                Mount(\"/static/\", StaticFiles(directory=\"static\"), name=\"static\"),\n            ],\n            timeout=600,\n        )\n\n        self.prefix_checkpoints_list = None\n\n    def homepage(self, request: Request) -> HTMLResponse:\n        
return self.templates.TemplateResponse(\"index.html\", {\"request\": request})\n\n    def generate(self, request: dict) -> JSONResponse:\n        response = requests.post(\n            f\"http://{self.generation_backend_host}:{self.generation_backend_port}/generate\",\n            json=request,\n            verify=False,\n        )\n        return JSONResponse(content=response.json())\n\n    def run(self):\n        # get around CORS\n        self.app.add_middleware(\n            CORSMiddleware,\n            allow_origins=[\"*\"],\n            allow_credentials=True,\n            allow_methods=[\"*\"],\n            allow_headers=[\"*\"],\n        )\n\n        run(self.app, host=self.ui_host, port=self.ui_port, workers=self.workers)\n\n\ndef main() -> None:\n    Server(get_args()).run()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  }
]