Repository: huggingface/transformers-bloom-inference Branch: main Commit: 62698bf4b75a Files: 39 Total size: 117.3 KB Directory structure: gitextract_c87qb2fj/ ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── bloom-inference-scripts/ │ ├── README.md │ ├── bloom-accelerate-inference.py │ ├── bloom-ds-inference.py │ └── bloom-ds-zero-inference.py ├── inference_server/ │ ├── benchmark.py │ ├── cli.py │ ├── constants.py │ ├── download_model.py │ ├── model_handler/ │ │ ├── __init__.py │ │ ├── deployment.py │ │ ├── grpc_utils/ │ │ │ ├── __init__.py │ │ │ ├── generation_server.py │ │ │ ├── pb/ │ │ │ │ ├── __init__.py │ │ │ │ ├── generation_pb2.py │ │ │ │ └── generation_pb2_grpc.py │ │ │ └── proto/ │ │ │ └── generation.proto │ │ └── launch.py │ ├── models/ │ │ ├── __init__.py │ │ ├── ds_inference.py │ │ ├── ds_zero.py │ │ ├── hf_accelerate.py │ │ ├── hf_cpu.py │ │ └── model.py │ ├── server.py │ └── utils/ │ ├── __init__.py │ ├── requests.py │ └── utils.py ├── server_request.py ├── setup.cfg ├── static/ │ ├── css/ │ │ └── style.css │ └── js/ │ └── index.js ├── templates/ │ └── index.html └── ui.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ __pycache__/ ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: - id: isort name: isort (python) - repo: https://github.com/psf/black rev: 23.1.0 hooks: - id: black args: [--line-length=119,--target-version=py35] ================================================ FILE: Dockerfile ================================================ FROM nvidia/cuda:11.6.1-devel-ubi8 as base RUN dnf install -y --disableplugin=subscription-manager make git && dnf clean all 
--disableplugin=subscription-manager # taken form pytorch's dockerfile RUN curl -L -o ./miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ chmod +x ./miniconda.sh && \ ./miniconda.sh -b -p /opt/conda && \ rm ./miniconda.sh ENV PYTHON_VERSION=3.9 \ PATH=/opt/conda/envs/inference/bin:/opt/conda/bin:${PATH} # create conda env RUN conda create -n inference python=${PYTHON_VERSION} pip -y # change shell to activate env SHELL ["conda", "run", "-n", "inference", "/bin/bash", "-c"] FROM base as conda # update conda RUN conda update -n base -c defaults conda -y # cmake RUN conda install -c anaconda cmake -y # necessary stuff RUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \ transformers==4.26.1 \ deepspeed==0.7.6 \ accelerate==0.16.0 \ gunicorn==20.1.0 \ flask \ flask_api \ fastapi==0.89.1 \ uvicorn==0.19.0 \ jinja2==3.1.2 \ pydantic==1.10.2 \ huggingface_hub==0.12.1 \ grpcio-tools==1.50.0 \ --no-cache-dir # clean conda env RUN conda clean -ya # change this as you like 🤗 ENV TRANSFORMERS_CACHE=/cos/HF_cache \ HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE} FROM conda as app WORKDIR /src RUN chmod -R g+w /src RUN mkdir /.cache && \ chmod -R g+w /.cache ENV PORT=5000 \ UI_PORT=5001 EXPOSE ${PORT} EXPOSE ${UI_PORT} CMD git clone https://github.com/huggingface/transformers-bloom-inference.git && \ cd transformers-bloom-inference && \ # install grpc and compile protos make gen-proto && \ make bloom-560m ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ gen-proto: mkdir -p inference_server/model_handler/grpc_utils/pb python -m grpc_tools.protoc -Iinference_server/model_handler/grpc_utils/proto --python_out=inference_server/model_handler/grpc_utils/pb --grpc_python_out=inference_server/model_handler/grpc_utils/pb inference_server/model_handler/grpc_utils/proto/generation.proto find inference_server/model_handler/grpc_utils/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . 
\1/g' {} \; touch inference_server/model_handler/grpc_utils/__init__.py touch inference_server/model_handler/grpc_utils/pb/__init__.py rm -rf inference_server/model_handler/grpc_utils/pb/*.py-e ui: python -m ui --ui_host 127.0.0.1 --ui_port 5001 --generation_backend_host 127.0.0.1 --generation_backend_port 5000 & # ------------------------- DS inference ------------------------- bloom-176b: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=bigscience/bloom \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=fp16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' # loads faster than the above one microsoft-bloom-176b: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=fp16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' bloomz-176b: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=bigscience/bloomz \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=fp16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' bloom-176b-int8: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=int8 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s 
"%(r)s" %(s)s %(b)s' # ------------------------- HF accelerate ------------------------- bloom-560m: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=bigscience/bloom-560m \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=32 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' flan-t5-xxl: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=google/flan-t5-xxl \ MODEL_CLASS=AutoModelForSeq2SeqLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' ul2: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=google/ul2 \ MODEL_CLASS=AutoModelForSeq2SeqLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' codegen-mono: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=Salesforce/codegen-16B-mono \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' # ------------------------- HF CPU ------------------------- bloom-560m-cpu: make ui MODEL_NAME=bigscience/bloom-560m \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=hf_cpu \ DTYPE=fp32 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=32 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' flan-t5-base-cpu: make ui 
MODEL_NAME=google/flan-t5-base \ MODEL_CLASS=AutoModelForSeq2SeqLM \ DEPLOYMENT_FRAMEWORK=hf_cpu \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=32 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' ================================================ FILE: README.md ================================================ > [!NOTE] > This repository has been archived and is not being maintained any longer since a lot more efficient serving frameworks have been released recently like vLLM and TGI. # Fast Inference Solutions for BLOOM This repo provides demos and packages to perform fast inference solutions for BLOOM. Some of the solutions have their own repos in which case a link to the [corresponding repos](#Other-inference-solutions) is provided instead. # Inference solutions for BLOOM 176B We support HuggingFace accelerate and DeepSpeed Inference for generation. Install required packages: ```shell pip install flask flask_api gunicorn pydantic accelerate huggingface_hub>=0.9.0 deepspeed>=0.7.3 deepspeed-mii==0.0.2 ``` alternatively you can also install deepspeed from source: ```shell git clone https://github.com/microsoft/DeepSpeed cd DeepSpeed CFLAGS="-I$CONDA_PREFIX/include/" LDFLAGS="-L$CONDA_PREFIX/lib/" TORCH_CUDA_ARCH_LIST="7.0" DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check ``` All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B (fp16/bf16) and 4 A100 80GB GPUs for BLOOM 176B (int8). These scripts might not work for other models or a different number of GPUs. DS inference is deployed using logic borrowed from DeepSpeed MII library. Note: Sometimes GPU memory is not freed when DS inference deployment crashes. You can free this memory by running `killall python` in terminal. For using BLOOM quantized, use dtype = int8. 
Also, change the model_name to microsoft/bloom-deepspeed-inference-int8 for DeepSpeed-Inference. For HF accelerate, no change is needed for model_name. HF accelerate uses [LLM.int8()](https://arxiv.org/abs/2208.07339) and DS-inference uses [ZeroQuant](https://arxiv.org/abs/2206.01861) for post-training quantization. ## BLOOM inference via command-line This asks for generate_kwargs every time. Example: generate_kwargs = ```json {"min_length": 100, "max_new_tokens": 100, "do_sample": false} ``` 1. using HF accelerate ```shell python -m inference_server.cli --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` 2. using DS inference ```shell python -m inference_server.cli --model_name microsoft/bloom-deepspeed-inference-fp16 --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` ## BLOOM server deployment [make ](../Makefile) can be used to launch a generation server. Please note that the serving method is synchronous and users have to wait in queue until the preceding requests have been processed. An example to fire server requests is given [here](./server_request.py). Alternatively, a [Dockerfile](./Dockerfile) is also provided which launches a generation server on port 5000. An interactive UI can be launched via the following command to connect to the generation server. The default URL of the UI is `http://127.0.0.1:5001/`. The `model_name` is just used by the UI to check if the model is decoder or encoder-decoder model. ```shell python -m ui --model_name bigscience/bloom ``` This command launches the following UI to play with generation. Sorry for the crappy design. Unfortunately, my UI skills only go so far. 😅😅😅 ![image](assets/UI.png) ## Benchmark system for BLOOM inference 1. 
using HF accelerate ```shell python -m inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5 ``` 2. using DS inference ```shell deepspeed --num_gpus 8 --module inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5 ``` alternatively, to load model faster: ```shell deepspeed --num_gpus 8 --module inference_server.benchmark --model_name microsoft/bloom-deepspeed-inference-fp16 --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5 ``` 3. using DS ZeRO ```shell deepspeed --num_gpus 8 --module inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5 ``` # Support If you run into things not working or have other questions please open an Issue in the corresponding backend: - [Accelerate](https://github.com/huggingface/accelerate/issues) - [Deepspeed-Inference](https://github.com/microsoft/DeepSpeed/issues) - [Deepspeed-ZeRO](https://github.com/microsoft/DeepSpeed/issues) If there is a specific issue with one of the scripts and not the backend only then please open an Issue here and tag [@mayank31398](https://github.com/mayank31398). # Other inference solutions ## Client-side solutions Solutions developed to perform large batch inference locally: * [Custom HF Code](https://github.com/huggingface/transformers_bloom_parallel/). JAX: * [BLOOM Inference in JAX](https://github.com/huggingface/bloom-jax-inference) ## Server solutions A solution developed to be used in a server mode (i.e. varied batch size, varied request rate) can be found [here](https://github.com/Narsil/bloomserver). This is implemented in Rust. 
================================================ FILE: bloom-inference-scripts/README.md ================================================ # Inference scripts for BLOOM ## BLOOM Inference solutions Here are some benchmark resuls on JeanZay's 8x80GB A100 node w/ 512GB of CPU memory: All benchmarks are doing greedy generation of 100 token outputs: ``` Generate args {'max_length': 100, 'do_sample': False} ``` The input prompt is comprised of just a few tokens. Throughput in msecs on 8x80GB gpus: | project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | 256 | 512 | | :---------------- | :----- | :---- | :---- | :---- | :--- | :--- | :--- | :--- | | accelerate bf16 | 230.38 | 31.78 | 17.84 | 10.89 | oom | | | | | accelerate int8 | 286.56 | 40.92 | 22.65 | 13.27 | oom | | | | | ds-inference fp16 | 44.02 | 5.70 | 3.01 | 1.68 | 1.00 | 0.69 | oom | | | ds-inference int8 | 89.09 | 11.44 | 5.88 | 3.09 | 1.71 | 1.02 | 0.71 | oom | | ds-zero bf16 | 283 | 34.88 | oom | | | | | | note: Since Deepspeed-ZeRO can process multiple generate streams in parallel its throughput can be further divided by 8 or 16, depending on whether 8 or 16 gpus were used during the generate. and, of course, it means that it can process a bs of 64 in the case of 8x80 A100 (the table above). Start to ready to generate in secs (mainly loading and data preparation time): | project | | | :---------------------- | :--- | | accelerate | 121 | | ds-inference shard-int8 | 61 | | ds-inference shard-fp16 | 60 | | ds-inference unsharded | 662 | | ds-zero | 462 | Now let's look at the power of quantized int8-based models provided by [Deepspeed-Inference](https://www.deepspeed.ai/tutorials/inference-tutorial/) and [BitsNBytes](https://github.com/TimDettmers/bitsandbytes), as it requires only half the original GPU memory of inference in bfloat16 or float16. 
Throughput in msecs 4x80GB A100: | project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | | :---------------- | :----- | :---- | :---- | :---- | :--- | :--- | | accelerate int8 | 284.15 | 40.14 | 21.97 | oom | | | | ds-inference int8 | 156.51 | 20.11 | 10.38 | 5.50 | 2.96 | oom | To get the benchmark results simply add `--benchmark` to any of these 3 scripts discussed below. ## Deepspeed-Inference Deepspeed-Inference uses Tensor-Parallelism and efficient fused CUDA kernels: https://www.deepspeed.ai/tutorials/inference-tutorial/ ### Setup ``` pip install deepspeed>=0.7.3 ``` ### Run 1. the fastest approach is to use a tp-pre-sharded checkpoint that takes only ~1min to load, as compared to 10min for non-presharded bloom checkpoint ``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-fp16 ``` 1a. if you want to run the original bloom checkpoint, which once loaded will run at the same throughput as the previous solution, but the loading will take 10-20min: ``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name bigscience/bloom ``` 2a. The 8bit quantized version requires you to have only half the GPU memory of the normal half precision version: ``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-int8 --dtype int8 ``` Here we used `microsoft/bloom-deepspeed-inference-int8` and also told the script to run in `int8`. 
And of course, just 4x80GB A100 gpus is now sufficient: ``` deepspeed --num_gpus 4 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-int8 --dtype int8 ``` ## HF Accelerate HF Accelerate can use naive Pipeline Parallelism to load a huge model over multiple GPUs: https://github.com/huggingface/accelerate ### Setup ``` pip install transformers>=4.21.3 accelerate>=0.12.0 ``` ### Run ``` python bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-accelerate-inference_bs=1.txt ``` To activate the 8bit quantized solution first install `bitsnbytes`: ``` pip install bitsandbytes ``` and then add `--dtype int8` to the previous command line: ``` python bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --dtype int8 --batch_size 1 --benchmark 2>&1 | tee bloom-int8-accelerate-inference_bs=1.txt ``` if you have more than 4 GPUs you can tell it to use only 4 with: ``` CUDA_VISIBLE_DEVICES=0,1,2,3 python bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --dtype int8 --batch_size 1 --benchmark 2>&1 | tee bloom-int8-accelerate-inference_bs=1.txt ``` ## Deepspeed ZeRO-Inference [Deepspeed ZeRO](https://www.deepspeed.ai/tutorials/zero/) uses a magical sharding approach which can take almost any model and scale it across a few or hundreds of GPUs. ### Setup ``` pip install deepspeed ``` ### Run Note that the script currently runs the same inputs on all GPUs, but you can run a different stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference. 
``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt ``` Please remember that with ZeRO the user can generate multiple unique streams at the same time - and thus the overall performance should be throughput in secs/token divided by number of participating gpus - so 8x to 16x faster depending on whether 8 or 16 gpus were used! You can also try the offloading solutions with just one small GPU, which will take a long time to run, but if you don't have 8 huge GPUs this is as good as it gets. CPU-Offload (1x gpus): ``` deepspeed --num_gpus 1 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --cpu_offload --benchmark 2>&1 | tee bloom-ds-zero-inference-cpu_offload_bs=8.txt ``` NVMe-Offload (1x gpus): ``` deepspeed --num_gpus 1 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --nvme_offload_path=/path/to/nvme_offload --benchmark 2>&1 | tee bloom-ds-zero-inference-nvme_offload_bs=8.txt ``` make sure to adjust `/path/to/nvme_offload` to somewhere you have ~400GB of free memory on a fast NVMe drive. ## Support If you run into things not working or have other questions please open an Issue in the corresponding backend: - [Accelerate](https://github.com/huggingface/accelerate/issues) - [Deepspeed-Inference](https://github.com/microsoft/DeepSpeed/issues) - [Deepspeed-ZeRO](https://github.com/microsoft/DeepSpeed/issues) If there a specific issue with one of the scripts and not the backend only then please open an Issue here and tag [@stas00](https://github.com/stas00). 
================================================ FILE: bloom-inference-scripts/bloom-accelerate-inference.py ================================================ import argparse import gc import math import os import time import torch import torch.distributed as dist from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") parser.add_argument("--name", type=str, help="Name path", required=True) parser.add_argument("--batch_size", default=1, type=int, help="batch size") parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") parser.add_argument("--greedy", action="store_true") parser.add_argument("--top-k", type=int, default=0) parser.add_argument("--top-p", type=float, default=0.0) parser.add_argument("--dtype", type=str, help="float16 or int8", choices=["int8", "float16"], default="float16") return parser.parse_args() t_start = time.time() num_tokens = 100 args = get_args() local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = torch.cuda.device_count() rank = local_rank def print_rank0(*msg): if rank != 0: return print(*msg) print_rank0(f"Using {world_size} gpus") model_name = args.name print_rank0(f"Loading model {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) # XXX: can't automatically derive dtype via config's `from_pretrained` dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 # print(get_max_memory_per_gpu_dict()) infer_dtype = args.dtype if infer_dtype == "int8": dtype = torch.int8 kwargs = dict( device_map="auto", ) def get_world_size() -> int: if dist.is_initialized(): return dist.get_world_size() else: return 1 # balanced_low_0 - because it allows a larger batch size with multiple GPUs if get_world_size() > 1: kwargs["device_map"] = "balanced_low_0" if infer_dtype == "int8": 
print_rank0("Using `load_in_8bit=True` to use quanitized model") kwargs["load_in_8bit"] = True else: kwargs["torch_dtype"] = dtype model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) if args.benchmark: t_ready = time.time() ### Generate print_rank0(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") input_sentences = [ "DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all", "Everyone is happy and I can", "The new movie that got Oscar this year", "In the far far distance from our galaxy,", "Peace is the only way", ] if args.batch_size > len(input_sentences): # dynamically extend to support larger bs by repetition input_sentences *= math.ceil(args.batch_size / len(input_sentences)) generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) # generate_kwargs = dict(max_new_tokens=num_tokens, use_cache=False, do_sample=False) # generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) print_rank0(f"Generate args {generate_kwargs}") inputs = input_sentences[: args.batch_size] def generate(): """returns a list of zipped inputs, outputs and number of new tokens""" input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = input_tokens[t].to("cuda:0") outputs = model.generate(**input_tokens, **generate_kwargs) input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] output_tokens_lengths = [x.shape[0] for x in outputs] total_new_tokens = [o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths)] outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) return zip(inputs, outputs, total_new_tokens) print_rank0("*** Running generate") t_generate_start = time.time() generated = generate() t_generate_span = time.time() - t_generate_start for i, o, _ in generated: print_rank0(f"{'-'*60}\nin={i}\nout={o}\n") ### Benchmark if 
# Benchmark: re-run generate() several times and report per-token timing stats.
if args.benchmark:
    # clear cache / free memory so the benchmark starts from a clean slate
    torch.cuda.empty_cache()
    gc.collect()

    print_rank0("*** Running benchmark")

    # warm up: the first pass triggers lazy CUDA/optimization work, so it is
    # excluded from the timed section below
    for i in range(1):
        _ = generate()
    torch.cuda.synchronize()

    # benchmark: time `cycles` full generate() calls back to back
    t0 = time.time()
    cycles = 5
    total_new_tokens_generated = 0
    for i in range(cycles):
        generated = generate()
        total_new_tokens_generated += sum(new_tokens for _, _, new_tokens in generated)
    torch.cuda.synchronize()
    # NOTE(review): despite the name, this value is seconds-per-token (inverse
    # throughput); the printout below multiplies by 1000 to report msecs/token.
    throughput = (time.time() - t0) / (total_new_tokens_generated)
    print_rank0(
        f"""
*** Performance stats:
Throughput per token including tokenize: {throughput*1000:.2f} msecs
Start to ready to generate: {t_ready - t_start:.3f} secs
Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs
Start to finish: {t_ready - t_start + t_generate_span:.3f} secs
"""
    )
def get_repo_root(model_name_or_path):
    """Return the local cache directory that holds the model repo snapshot.

    Rank 0 downloads first while every other rank waits on the barrier, so
    the snapshot is fetched exactly once per node; afterwards all ranks
    resolve the (now warm) cache path.
    """
    if is_offline_mode():
        print_rank0("Offline mode: forcing local_files_only=True")

    download_kwargs = dict(
        local_files_only=is_offline_mode(),
        cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
        ignore_patterns=["*.safetensors"],
    )

    # only the first process actually downloads; the rest just hit the cache
    if rank == 0:
        snapshot_download(model_name_or_path, **download_kwargs)

    dist.barrier()

    return snapshot_download(model_name_or_path, **download_kwargs)
get_checkpoint_files(model_name_or_path): cached_repo_dir = get_repo_root(model_name_or_path) # extensions: .bin | .pt # creates a list of paths from all downloaded files in cache dir file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()] return file_list model_name = args.name infer_dtype = args.dtype tp_presharded_mode = True if model_name in tp_presharded_models else False # print(get_checkpoint_files(model_name)) print_rank0(f"*** Loading the model {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name) # XXX: can't automatically derive dtype via config's `from_pretrained` # dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 # use one of these args to `init_inference` # 1. injection_policy is the slower version, but it's plain pytorch so it'll always work # 2. replace_with_kernel_inject is the faster one (fast fused kernels) kernel_inject = True # kernel_inject = False if kernel_inject: # XXX: for now ds-inference only works with fp16 dtype = torch.float16 else: dtype = torch.bfloat16 if args.benchmark: torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage("pre-from-pretrained", force=True) # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load with deepspeed.OnDevice(dtype=dtype, device="meta"): model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) if args.benchmark: deepspeed.runtime.utils.see_memory_usage("post-from-pretrained", force=True) model = model.eval() if args.benchmark: torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage("post-init-ds-zero-init", force=True) ### Deepspeed-Inference Loading checkpoints_json = "checkpoints.json" def write_checkpoints_json(): checkpoint_files = get_checkpoint_files(model_name) if rank == 0: data = {"type": "BLOOM", "checkpoints": 
def write_checkpoints_json():
    """Write the DeepSpeed ``checkpoints.json`` index listing all BLOOM shards.

    Every rank must call get_checkpoint_files() — it contains a dist.barrier()
    and skipping it on non-zero ranks would deadlock — but only rank 0 writes
    the index file.

    Fix: the original ``json.dump(data, open(checkpoints_json, "w"))`` leaked
    the file handle (never explicitly closed/flushed); a context manager
    guarantees the file is closed before DeepSpeed reads it.
    """
    checkpoint_files = get_checkpoint_files(model_name)
    if rank == 0:
        data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
        with open(checkpoints_json, "w") as f:
            json.dump(data, f)
def generate():
    """Tokenize `inputs`, generate with the DeepSpeed-Inference engine, and
    return an iterator of (input_text, output_text, num_new_tokens) triples."""
    batch = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    # move all tensor fields to this rank's GPU
    device = torch.cuda.current_device()
    for name in batch:
        if torch.is_tensor(batch[name]):
            batch[name] = batch[name].to(device)

    sequences = model.generate(**batch, **generate_kwargs)

    # per-sequence new-token count = output length - prompt length
    in_lengths = [t.shape[0] for t in batch.input_ids]
    out_lengths = [t.shape[0] for t in sequences]
    new_tokens = [out - inp for inp, out in zip(in_lengths, out_lengths)]

    texts = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    return zip(inputs, texts, new_tokens)
if args.benchmark: print_rank0("*** Running benchmark") # warm up for i in range(1): _ = generate() torch.cuda.synchronize() # benchmark t0 = time.time() cycles = 5 total_new_tokens_generated = 0 for i in range(cycles): generated = generate() total_new_tokens_generated += sum(new_tokens for _, _, new_tokens in generated) torch.cuda.synchronize() throughput = (time.time() - t0) / (total_new_tokens_generated) print_rank0( f""" *** Performance stats: Throughput per token including tokenize: {throughput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """ ) ================================================ FILE: bloom-inference-scripts/bloom-ds-zero-inference.py ================================================ # usage: # deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom # # to run benchmarks: # deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --benchmark # # This is going to improve, but at the moment, the process is a bit cumbersome - we first use # 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, # 2. free the allocated storage # 3. start Deepspeed-Inference and only now load the checkpoint # 4. run generate # Done. 
def print_rank0(*msg):
    """Print only on the global rank-0 process to avoid duplicated output."""
    if rank == 0:
        print(*msg)
# --- offload configuration: at most one of CPU / NVMe param offload may be on ---
if args.cpu_offload and args.nvme_offload_path:
    raise ValueError("Use one of --cpu_offload or --nvme_offload_path and not both")

if args.cpu_offload:
    # park ZeRO-3 params in pinned host memory
    ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True)

if args.nvme_offload_path:
    # park ZeRO-3 params on NVMe storage at the given path
    ds_config["zero_optimization"]["offload_param"] = dict(
        device="nvme",
        pin_memory=True,
        nvme_path=args.nvme_offload_path,
        buffer_size=4e9,
    )

# keeping a live HfDeepSpeedConfig tells from_pretrained to instantiate
# the weights directly on GPUs (ZeRO-3 sharded) instead of CPU first
dschf = HfDeepSpeedConfig(ds_config)

if args.benchmark:
    torch.cuda.empty_cache()
    gc.collect()
    deepspeed.runtime.utils.see_memory_usage("pre-from-pretrained", force=True)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

if args.benchmark:
    deepspeed.runtime.utils.see_memory_usage("post-from-pretrained", force=True)

model = model.eval()

print_rank0(ds_config)

# wrap the model in a ZeRO-3 inference engine; [0] is the engine itself
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
ds_engine.module.eval()
model = ds_engine.module

if args.benchmark:
    # record the "model fully ready" timestamp for the perf report
    t_ready = time.time()
    deepspeed.runtime.utils.see_memory_usage("start-of-generate", force=True)
def generate():
    """Tokenize `inputs`, generate with the ZeRO-sharded model, and return an
    iterator of (input_text, output_text, num_new_tokens) triples."""
    tokenized = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    # ship every tensor field to this rank's GPU
    target = torch.cuda.current_device()
    for field in tokenized:
        if torch.is_tensor(tokenized[field]):
            tokenized[field] = tokenized[field].to(target)

    result_ids = model.generate(**tokenized, **generate_kwargs)

    # number of freshly generated tokens per sequence
    prompt_lens = [ids.shape[0] for ids in tokenized.input_ids]
    result_lens = [ids.shape[0] for ids in result_ids]
    fresh_counts = [r - p for p, r in zip(prompt_lens, result_lens)]

    result_texts = tokenizer.batch_decode(result_ids, skip_special_tokens=True)
    return zip(inputs, result_texts, fresh_counts)
def get_benchmark_results(
    benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int
) -> str:
    """Format a human-readable performance summary for a benchmark run.

    benchmark_time is the wall-clock time for all `cycles` generate calls;
    the same throughput is reported both as tokens/sec and msecs/token.
    """
    tokens_per_sec = total_new_tokens_generated / benchmark_time
    msecs_per_token = 1000 / tokens_per_sec
    secs_per_batch = benchmark_time / cycles
    return f"""
*** Performance stats:
Throughput (including tokenization) = {tokens_per_sec:.2f} tokens/sec
Throughput (including tokenization) = {msecs_per_token:.2f} msecs/token
Model loading time = {initialization_time:.2f} secs
Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size}
Latency = {secs_per_batch:.2f} secs
Model loading time + generation time per batch = {initialization_time + secs_per_batch:.2f} secs
"""
def benchmark_end_to_end(args: argparse.Namespace) -> None:
    """Load the model, run one warmup generation, then (optionally) a timed benchmark."""
    deployment, load_secs = run_and_log_time(partial(ModelDeployment, args=args, grpc_allowed=False))

    request = create_generate_request(get_dummy_batch(args.batch_size), args.generate_kwargs)

    print_rank_0(f"generate_kwargs = {args.generate_kwargs}")
    print_rank_0(f"batch_size = {args.batch_size}")

    # warmup is a must if measuring speed as it's when all the optimizations are performed
    # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs
    warmup_response = deployment.generate(request=request)

    for i, (o, _) in zip(request.text, zip(warmup_response.text, warmup_response.num_generated_tokens)):
        print_rank_0(f"{'-' * 60}\nin = {i}\nout = {o}\n")

    if args.benchmark_cycles <= 0:
        return

    print_rank_0("*** Running benchmark")

    torch.cuda.empty_cache()
    gc.collect()

    # one more warmup pass right before the timed section
    deployment.generate(request=request)
    torch.cuda.synchronize()

    # timed section
    generated_tokens, bench_secs = run_and_log_time(
        partial(benchmark_generation, model=deployment, request=request, cycles=args.benchmark_cycles)
    )

    # with ZeRO every GPU is generating batch_size * sequence_length tokens
    if args.deployment_framework == DS_ZERO:
        generated_tokens *= get_world_size()

    print_rank_0(get_benchmark_results(bench_secs, load_secs, generated_tokens, args.batch_size, args.benchmark_cycles))
def _prompt_for_generate_kwargs() -> dict:
    """Keep prompting until the user enters valid JSON generate kwargs."""
    while True:
        try:
            return json.loads(input("Generate kwargs: "))
        except Exception:
            e_type, e_message, _ = sys.exc_info()
            print("error =", e_type.__name__)
            print("message =", e_message)


def main() -> None:
    """Interactive loop: read a prompt, optionally change generation kwargs, generate."""
    args = get_args()
    model = ModelDeployment(args, True)
    generate_kwargs = args.generate_kwargs

    while True:
        input_text = input("Input text: ")

        if input("change generate_kwargs? [y/n] ") == "y":
            generate_kwargs = _prompt_for_generate_kwargs()

        response = model.generate(text=[input_text], generate_kwargs=generate_kwargs)

        print_rank_0("Output text:", response.text[0])
        print_rank_0("Generated tokens:", response.num_generated_tokens[0])
def main() -> None:
    """Pre-download the config, tokenizer, and weights so later launches hit the cache."""
    args = get_args()

    model_name = args.model_name
    print("downloading", model_name)

    # fetching each artifact populates the shared HF cache
    AutoConfig.from_pretrained(model_name)
    AutoTokenizer.from_pretrained(model_name)
    get_hf_model_class(args.model_class).from_pretrained(model_name)
should_use_grpc(self, deployment_framework: str, grpc_allowed: bool) -> bool: if grpc_allowed and get_world_size() > 1: return deployment_framework in [DS_INFERENCE, DS_ZERO] return False def initialize_ports(self): self.ports = [] for i in range(self.num_gpus): self.ports.append(50950 + self.cuda_visible_devices[i]) def _is_socket_open(self, port): import socket sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) result = sock.connect_ex(("0.0.0.0", port)) sock.close() return result == 0 def _is_server_process_alive(self): if self.process is None: return True try: self.process.wait(1) except subprocess.TimeoutExpired as err: # timeout means we're still running and all (probably) okay is_alive = True else: # no exception case is_alive = False return is_alive def _wait_until_server_is_live(self): sockets_open = False while not sockets_open: sockets_open = self._is_socket_open(self.ports[0]) process_alive = self._is_server_process_alive() if not process_alive: raise RuntimeError("server crashed for some reason, unable to proceed") time.sleep(4) print_rank_0("waiting for server to start...") print_rank_0(f"server has started on {self.ports[0]}") def dict_to_proto(self, generate_kwargs: dict) -> dict: result = {} for k, v in generate_kwargs.items(): if v is not None: x = generation_pb2.Value() setattr(x, self.dtype_proto_field[type(v)], v) result[k] = x return result def _initialize_service(self, args: argparse.Namespace): if self._is_socket_open(self.ports[0]): raise RuntimeError( f"Server is already running on port {self.ports}, please shutdown or use different port." 
        )

        # DeepSpeed frameworks run as a separate process group launched via the
        # `deepspeed` CLI; each rank serves one GRPC port from self.ports.
        if args.deployment_framework in [DS_INFERENCE, DS_ZERO]:
            ports = " ".join(map(str, self.ports))

            cmd = f"inference_server.model_handler.launch --model_name {args.model_name} --deployment_framework {args.deployment_framework} --dtype {get_str_dtype(args.dtype)} --port {ports} --model_class {args.model_class}"

            if args.max_batch_size is not None:
                cmd += f" --max_batch_size {args.max_batch_size}"
            if args.max_input_length is not None:
                cmd += f" --max_input_length {args.max_input_length}"

            # offset from DeepSpeed's default 29500 by the lowest visible GPU id
            # so that deployments on disjoint GPU sets don't collide on the port
            master_port = 29500 + min(self.cuda_visible_devices)

            cuda_visible_devices = ",".join(map(str, self.cuda_visible_devices))

            cmd = f"deepspeed --master_port {master_port} --include localhost:{cuda_visible_devices} --module {cmd}"
        else:
            raise NotImplementedError(f"unsupported deployment_framework: {args.deployment_framework}")

        cmd = cmd.split(" ")

        self.process = subprocess.Popen(cmd)

    def _initialize_grpc_client(self):
        """Create one async GRPC stub per tensor-parallel rank (one port each)."""
        self.stubs = []
        for i in self.ports:
            channel = grpc.aio.insecure_channel(f"localhost:{i}")
            stub = generation_pb2_grpc.GenerationServiceStub(channel)
            self.stubs.append(stub)

    # runs the task on all ranks in parallel and returns the first rank's task;
    # all tensor-parallel ranks produce identical output
    async def generate_in_tensor_parallel(self, text: List[str], generate_kwargs: dict):
        responses = []
        for i in range(self.num_gpus):
            responses.append(self.asyncio_loop.create_task(self.generate_async(i, text, generate_kwargs)))

        # NOTE(review): only rank 0's task is awaited before returning — the
        # caller reads .result() from it; confirm the remaining tasks complete
        await responses[0]
        return responses[0]

    async def generate_async(self, stub_id: int, text: List[str], generate_kwargs: dict):
        """Send one Generate RPC to the rank identified by stub_id."""
        req = generation_pb2.GenerationRequestProto(texts=text, generate_kwargs=generate_kwargs)
        response = await self.stubs[stub_id].Generate(req)
        return response

    # runs the task on all ranks in parallel and returns the first rank's task;
    # all tensor-parallel ranks produce identical output
    async def forward_in_tensor_parallel(self, conditioning_text: List[str], response: List[str]):
        responses = []
        for i in range(self.num_gpus):
            responses.append(self.asyncio_loop.create_task(self.forward_async(i, conditioning_text, response)))

        await responses[0]
        return responses[0]

    async def forward_async(self, stub_id: int, conditioning_text: List[str], response: List[str]):
        """Send one Forward RPC to the rank identified by stub_id."""
        req = generation_pb2.ForwardRequestProto(conditioning_text=conditioning_text, response=response)
        response = await self.stubs[stub_id].Forward(req)
        return response

    def generate(self, **kwargs) -> GenerateResponse:
        """Generate text via the GRPC workers or the in-process model.

        Accepts either `request=GenerateRequest` or raw `text` + `generate_kwargs`.
        """
        if self.use_grpc_server:
            if "request" in kwargs:
                text = kwargs["request"].text
                generate_kwargs = kwargs["request"].get_generate_kwargs()
            else:
                text = kwargs["text"]
                generate_kwargs = kwargs["generate_kwargs"]

            generate_kwargs = self.dict_to_proto(generate_kwargs)

            response = self.asyncio_loop.run_until_complete(
                self.generate_in_tensor_parallel(text, generate_kwargs)
            ).result()

            if response.error:
                raise Exception(response.error)
            else:
                return GenerateResponse(
                    text=[r for r in response.texts], num_generated_tokens=[n for n in response.num_generated_tokens]
                )
        else:
            if "request" in kwargs:
                request = kwargs["request"]
            else:
                request = create_generate_request(**kwargs)

            response = self.model.generate(request)

            # in-process models return exceptions instead of raising them
            if isinstance(response, Exception):
                raise response
            else:
                return response

    def forward(self, request: ForwardRequest) -> ForwardResponse:
        """Score `response` given `conditioning_text` via GRPC workers or in process."""
        if self.use_grpc_server:
            response = self.asyncio_loop.run_until_complete(
                self.forward_in_tensor_parallel(request.conditioning_text, request.response)
            ).result()

            if response.error:
                raise Exception(response.error)
            else:
                return ForwardResponse(nll=response.nll)
        else:
            response = self.model.forward(request)

            if isinstance(response, Exception):
                raise response
            else:
                return response

    def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:
        """Tokenize with the local tokenizer (GRPC mode) or delegate to the model."""
        if self.use_grpc_server:
            # NOTE(review): TokenizeRequest (utils/requests.py) declares no
            # `padding` field and TokenizeResponse no `attention_mask` field —
            # confirm these attributes exist at runtime
            response = self.tokenizer(request.text, padding=request.padding)
            response = TokenizeResponse(token_ids=response.input_ids, attention_mask=response.attention_mask)
        else:
            response = self.model.tokenize(request)

        return response


================================================
FILE: inference_server/model_handler/grpc_utils/__init__.py
================================================ ================================================ FILE: inference_server/model_handler/grpc_utils/generation_server.py ================================================ import os from concurrent import futures import torch import grpc # from ...constants import GRPC_MAX_MSG_SIZE from ...models import Model from ...utils import ForwardRequest, TokenizeRequest, create_generate_request, print_rank_0 from .pb import generation_pb2, generation_pb2_grpc class GenerationServer(generation_pb2_grpc.GenerationServiceServicer): def __init__(self, model: Model) -> None: self.model = model def _unpack_proto_query_kwargs(self, query_kwargs): query_kwargs = {k: getattr(v, v.WhichOneof("oneof_values")) for k, v in query_kwargs.items()} return query_kwargs def Generate(self, request, context): text = [r for r in request.texts] generate_kwargs = self._unpack_proto_query_kwargs(request.generate_kwargs) request = create_generate_request(text=text, generate_kwargs=generate_kwargs) local_rank = int(os.getenv("LOCAL_RANK", "0")) torch.cuda.set_device(local_rank) self.model.input_device = local_rank response = self.model.generate(request) if isinstance(response, Exception): # if exception occurs, we don't this subprocess to crash response = generation_pb2.GenerationResponseProto( error=str(response), is_encoder_decoder=response.is_encoder_decoder ) else: response = generation_pb2.GenerationResponseProto( texts=response.text, num_generated_tokens=response.num_generated_tokens, is_encoder_decoder=response.is_encoder_decoder, ) return response def Forward(self, request, context): conditioning_text = [r for r in request.conditioning_text] response = [r for r in request.response] request = ForwardRequest(conditioning_text=conditioning_text, response=response) local_rank = int(os.getenv("LOCAL_RANK", "0")) torch.cuda.set_device(local_rank) self.model.input_device = local_rank response = self.model.forward(request) if isinstance(response, 
Exception): # if exception occurs, we don't this subprocess to crash response = generation_pb2.ForwardResponseProto( error=str(response), is_encoder_decoder=response.is_encoder_decoder ) else: response = generation_pb2.ForwardResponseProto( nll=response.nll, is_encoder_decoder=response.is_encoder_decoder ) return response def serve(inference_pipeline, port): server = grpc.server( futures.ThreadPoolExecutor(max_workers=1), # options=[ # ("grpc.max_send_message_length", GRPC_MAX_MSG_SIZE), # ("grpc.max_receive_message_length", GRPC_MAX_MSG_SIZE), # ], ) generation_pb2_grpc.add_GenerationServiceServicer_to_server(GenerationServer(inference_pipeline), server) server.add_insecure_port(f"[::]:{port}") print_rank_0("About to start server") server.start() print_rank_0("Started") server.wait_for_termination() ================================================ FILE: inference_server/model_handler/grpc_utils/pb/__init__.py ================================================ ================================================ FILE: inference_server/model_handler/grpc_utils/pb/generation_pb2.py ================================================ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: generation.proto """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( b'\n\x10generation.proto\x12\ngeneration"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values"\xc2\x01\n\x16GenerationRequestProto\x12\r\n\x05texts\x18\x01 \x03(\t\x12O\n\x0fgenerate_kwargs\x18\x02 \x03(\x0b\x32\x36.generation.GenerationRequestProto.GenerateKwargsEntry\x1aH\n\x13GenerateKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12 \n\x05value\x18\x02 \x01(\x0b\x32\x11.generation.Value:\x02\x38\x01"q\n\x17GenerationResponseProto\x12\r\n\x05texts\x18\x01 \x03(\t\x12\x1c\n\x14num_generated_tokens\x18\x02 \x03(\x05\x12\r\n\x05\x65rror\x18\x03 \x01(\t\x12\x1a\n\x12is_encoder_decoder\x18\x04 \x01(\x08"B\n\x13\x46orwardRequestProto\x12\x19\n\x11\x63onditioning_text\x18\x01 \x03(\t\x12\x10\n\x08response\x18\x02 \x03(\t"N\n\x14\x46orwardResponseProto\x12\x0b\n\x03nll\x18\x01 \x01(\x02\x12\r\n\x05\x65rror\x18\x02 \x01(\t\x12\x1a\n\x12is_encoder_decoder\x18\x03 \x01(\x08\x32\xba\x01\n\x11GenerationService\x12U\n\x08Generate\x12".generation.GenerationRequestProto\x1a#.generation.GenerationResponseProto"\x00\x12N\n\x07\x46orward\x12\x1f.generation.ForwardRequestProto\x1a .generation.ForwardResponseProto"\x00\x62\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "generation_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._options 
= None _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_options = b"8\001" _VALUE._serialized_start = 32 _VALUE._serialized_end = 127 _GENERATIONREQUESTPROTO._serialized_start = 130 _GENERATIONREQUESTPROTO._serialized_end = 324 _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_start = 252 _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_end = 324 _GENERATIONRESPONSEPROTO._serialized_start = 326 _GENERATIONRESPONSEPROTO._serialized_end = 439 _FORWARDREQUESTPROTO._serialized_start = 441 _FORWARDREQUESTPROTO._serialized_end = 507 _FORWARDRESPONSEPROTO._serialized_start = 509 _FORWARDRESPONSEPROTO._serialized_end = 587 _GENERATIONSERVICE._serialized_start = 590 _GENERATIONSERVICE._serialized_end = 776 # @@protoc_insertion_point(module_scope) ================================================ FILE: inference_server/model_handler/grpc_utils/pb/generation_pb2_grpc.py ================================================ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc from . import generation_pb2 as generation__pb2 class GenerationServiceStub(object): """Missing associated documentation comment in .proto file.""" def __init__(self, channel): """Constructor. Args: channel: A grpc.Channel. 
""" self.Generate = channel.unary_unary( "/generation.GenerationService/Generate", request_serializer=generation__pb2.GenerationRequestProto.SerializeToString, response_deserializer=generation__pb2.GenerationResponseProto.FromString, ) self.Forward = channel.unary_unary( "/generation.GenerationService/Forward", request_serializer=generation__pb2.ForwardRequestProto.SerializeToString, response_deserializer=generation__pb2.ForwardResponseProto.FromString, ) class GenerationServiceServicer(object): """Missing associated documentation comment in .proto file.""" def Generate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def Forward(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def add_GenerationServiceServicer_to_server(servicer, server): rpc_method_handlers = { "Generate": grpc.unary_unary_rpc_method_handler( servicer.Generate, request_deserializer=generation__pb2.GenerationRequestProto.FromString, response_serializer=generation__pb2.GenerationResponseProto.SerializeToString, ), "Forward": grpc.unary_unary_rpc_method_handler( servicer.Forward, request_deserializer=generation__pb2.ForwardRequestProto.FromString, response_serializer=generation__pb2.ForwardResponseProto.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler("generation.GenerationService", rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) # This class is part of an EXPERIMENTAL API. 
class GenerationService(object): """Missing associated documentation comment in .proto file.""" @staticmethod def Generate( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_unary( request, target, "/generation.GenerationService/Generate", generation__pb2.GenerationRequestProto.SerializeToString, generation__pb2.GenerationResponseProto.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, ) @staticmethod def Forward( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_unary( request, target, "/generation.GenerationService/Forward", generation__pb2.ForwardRequestProto.SerializeToString, generation__pb2.ForwardResponseProto.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, ) ================================================ FILE: inference_server/model_handler/grpc_utils/proto/generation.proto ================================================ syntax = "proto3"; package generation; service GenerationService { rpc Generate (GenerationRequestProto) returns (GenerationResponseProto) {} rpc Forward (ForwardRequestProto) returns (ForwardResponseProto) {} } message Value { oneof oneof_values { string svalue = 1; int64 ivalue = 2; float fvalue = 3; bool bvalue = 4; } } message GenerationRequestProto { repeated string texts = 1; map generate_kwargs = 2; } message GenerationResponseProto { repeated string texts = 1; repeated int32 num_generated_tokens = 2; string error = 3; bool is_encoder_decoder = 4; } message ForwardRequestProto { repeated string conditioning_text = 1; repeated string response = 2; } message ForwardResponseProto { float nll = 1; string error = 
2;
  bool is_encoder_decoder = 3;
}

================================================
FILE: inference_server/model_handler/launch.py
================================================
"""
Copyright 2022 The Microsoft DeepSpeed Team
"""
import argparse

import torch.distributed as dist

from ..models import get_model_class, start_inference_engine
from ..utils import get_argument_parser, parse_args
from .grpc_utils.generation_server import serve


def get_args() -> argparse.Namespace:
    """Parse launch-specific CLI arguments on top of the shared argument parser."""
    parser = get_argument_parser()

    group = parser.add_argument_group(title="launch config")
    group.add_argument("--local_rank", required=False, type=int, help="used by dist launchers")
    group.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload for DS ZeRO")
    # NOTE(review): deployment.py builds the command with `--port ...`; argparse
    # prefix matching resolves that to `--ports`, but it is fragile — confirm
    group.add_argument("--ports", nargs="+", help="GRPC ports")

    args = parse_args(parser)

    return args


def main():
    """Entry point for one tensor-parallel rank: init engine, load model, serve GRPC."""
    args = get_args()

    start_inference_engine(args.deployment_framework)

    model = get_model_class(args.deployment_framework)(args)

    # each rank listens on its own port, indexed by its distributed rank
    serve(model, args.ports[dist.get_rank()])


if __name__ == "__main__":
    main()


================================================
FILE: inference_server/models/__init__.py
================================================
from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE, HF_CPU
from .model import Model, get_hf_model_class, load_tokenizer


def get_model_class(deployment_framework: str):
    """Map a deployment-framework name to its Model subclass (lazily imported)."""
    if deployment_framework == HF_ACCELERATE:
        from .hf_accelerate import HFAccelerateModel

        return HFAccelerateModel
    elif deployment_framework == HF_CPU:
        from .hf_cpu import HFCPUModel

        return HFCPUModel
    elif deployment_framework == DS_INFERENCE:
        from .ds_inference import DSInferenceModel

        return DSInferenceModel
    elif deployment_framework == DS_ZERO:
        from .ds_zero import DSZeROModel

        return DSZeROModel
    else:
        raise ValueError(f"Unknown deployment framework {deployment_framework}")


def start_inference_engine(deployment_framework: str) -> None:
    """Initialize the distributed backend for DeepSpeed-based frameworks."""
    if deployment_framework in [DS_INFERENCE, DS_ZERO]:
        import
deepspeed deepspeed.init_distributed("nccl") ================================================ FILE: inference_server/models/ds_inference.py ================================================ import glob import io import json import os from argparse import Namespace from functools import partial import torch import deepspeed from huggingface_hub import try_to_load_from_cache from transformers import AutoConfig from ..utils import get_world_size, run_rank_n from .model import Model, get_hf_model_class # basic DeepSpeed inference model class for benchmarking class DSInferenceModel(Model): def __init__(self, args: Namespace) -> None: super().__init__(args) # create dummy tensors for allocating space which will be filled with # the actual weights while calling deepspeed.init_inference in the # following code with deepspeed.OnDevice(dtype=torch.float16, device="meta"): self.model = get_hf_model_class(args.model_class).from_config( AutoConfig.from_pretrained(args.model_name), torch_dtype=torch.bfloat16 ) self.model = self.model.eval() downloaded_model_path = get_model_path(args.model_name) if args.dtype in [torch.float16, torch.int8]: # We currently support the weights provided by microsoft (which are # pre-sharded) checkpoints_json = os.path.join(downloaded_model_path, "ds_inference_config.json") if os.path.isfile(checkpoints_json): self.model = deepspeed.init_inference( self.model, mp_size=get_world_size(), base_dir=downloaded_model_path, dtype=args.dtype, checkpoint=checkpoints_json, replace_with_kernel_inject=True, ) else: # for bigscience/bloom, sharding is done while loading the model # so this is much slower and for this we need to create a # checkpoints json with TemporaryCheckpointsJSON(downloaded_model_path) as checkpoints_json: self.model = deepspeed.init_inference( self.model, mp_size=get_world_size(), base_dir=downloaded_model_path, dtype=args.dtype, checkpoint=checkpoints_json, replace_with_kernel_inject=True, ) elif args.dtype == torch.bfloat16: # currently 
ds-inference only supports fp16 CUDA kernels :( raise NotImplementedError("bfloat16 is not yet supported") self.model = self.model.module self.input_device = torch.cuda.current_device() self.post_init(args.model_name) class TemporaryCheckpointsJSON: def __init__(self, model_path: str): self.tmp_directory = "tmp" self.tmp_file = os.path.join(self.tmp_directory, "checkpoints.json") self.model_path = model_path def write_checkpoints_json(self) -> None: print(self.model_path) with io.open(self.tmp_file, "w", encoding="utf-8") as f: data = {"type": "BLOOM", "checkpoints": glob.glob(f"{self.model_path}/*.bin"), "version": 1.0} json.dump(data, f) def __enter__(self): run_rank_n(os.makedirs, barrier=True)(self.tmp_directory, exist_ok=True) run_rank_n(self.write_checkpoints_json, barrier=True)() return self.tmp_file def __exit__(self, type, value, traceback): return def get_model_path(model_name: str): try: config_file = "config.json" # will fall back to HUGGINGFACE_HUB_CACHE config_path = try_to_load_from_cache(model_name, config_file, cache_dir=os.getenv("TRANSFORMERS_CACHE")) if config_path is None: # treat the model name as an explicit model path return model_name else: return os.path.dirname(config_path) except: # treat the model name as an explicit model path return model_name ================================================ FILE: inference_server/models/ds_zero.py ================================================ from argparse import Namespace import torch import deepspeed from transformers import AutoConfig from transformers.deepspeed import HfDeepSpeedConfig from ..utils import get_world_size from .model import Model, get_hf_model_class class DSZeROModel(Model): def __init__(self, args: Namespace) -> None: super().__init__(args) config = AutoConfig.from_pretrained(args.model_name) train_micro_batch_size_per_gpu = 1 train_batch_size = train_micro_batch_size_per_gpu * get_world_size() # try playing with these parameters, might improve throughput for you # hardware 
setup
        ds_config = {
            "fp16": {
                "enabled": args.dtype == torch.float16,
            },
            "bf16": {
                "enabled": args.dtype == torch.bfloat16,
            },
            "zero_optimization": {
                "stage": 3,
                "overlap_comm": True,
                "contiguous_gradients": True,
                "reduce_bucket_size": config.hidden_size * config.hidden_size,
                "stage3_prefetch_bucket_size": 0.9 * config.hidden_size * config.hidden_size,
                "stage3_param_persistence_threshold": 0,
            },
            "steps_per_print": 2000,
            "train_batch_size": train_batch_size,
            "train_micro_batch_size_per_gpu": train_micro_batch_size_per_gpu,
            "wall_clock_breakdown": False,
        }

        if args.cpu_offload:
            ds_config["zero_optimization"]["offload_param"] = {"device": "cpu", "pin_memory": True}

        # this tells from_pretrained to instantiate directly on gpus
        dschf = HfDeepSpeedConfig(ds_config)

        self.model = get_hf_model_class(args.model_class).from_pretrained(args.model_name, torch_dtype=args.dtype)
        self.model = self.model.eval()

        # convert model to a fully sharded model using ZeRO
        self.model = deepspeed.initialize(model=self.model, config_params=ds_config)[0]

        self.model.module.eval()
        self.model = self.model.module

        # this is the CUDA device for the current process. This will be used
        # later to identify the GPU on which to transfer tensors
        self.input_device = torch.cuda.current_device()

        self.post_init(args.model_name)


================================================
FILE: inference_server/models/hf_accelerate.py
================================================
from argparse import Namespace

import torch

from ..utils import get_world_size
from .model import Model, get_hf_model_class


class HFAccelerateModel(Model):
    def __init__(self, args: Namespace) -> None:
        """Load the model with accelerate's automatic device placement."""
        super().__init__(args)

        kwargs = {"pretrained_model_name_or_path": args.model_name, "device_map": "auto"}

        if get_world_size() > 1:
            # keep GPU 0 lighter since it also gathers generation outputs
            kwargs["device_map"] = "balanced_low_0"

        if args.dtype == torch.int8:
            # using LLM.int8()
            kwargs["load_in_8bit"] = True
        else:
            kwargs["torch_dtype"] = args.dtype

        # this is the CUDA device for the current process. This will be used
        # later to identify the GPU on which to transfer tensors
        self.model = get_hf_model_class(args.model_class).from_pretrained(**kwargs)

        self.model.requires_grad_(False)
        self.model.eval()
        self.input_device = "cuda:0"

        self.post_init(args.model_name)


================================================
FILE: inference_server/models/hf_cpu.py
================================================
from argparse import Namespace

from .hf_accelerate import HFAccelerateModel


class HFCPUModel(HFAccelerateModel):
    # NOTE(review): super().__init__ still builds the model with a GPU
    # device_map before input_device is overridden to "cpu" — confirm
    def __init__(self, args: Namespace) -> None:
        super().__init__(args)
        self.input_device = "cpu"


================================================
FILE: inference_server/models/model.py
================================================
import argparse
import copy
from typing import List, Union

import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

from ..utils import (
    ForwardRequest,
    ForwardResponse,
    GenerateRequest,
    GenerateResponse,
    TokenizeRequest,
    TokenizeResponse,
)


class Model:
    # base class shared by all deployment backends; subclasses set self.model
    # and self.input_device, then call post_init
    def __init__(self, args: argparse.Namespace) -> None:
        self.model = None
        self.input_device = None
        self.max_input_length = args.max_input_length
        self.max_batch_size = args.max_batch_size

    def post_init(self, model_name: str) -> None:
        """Finish setup once weights are loaded: config flags, tokenizer, pad ids."""
        self.is_encoder_decoder = AutoConfig.from_pretrained(model_name).is_encoder_decoder
        self.generation_config = GenerationConfig.from_model_config(AutoConfig.from_pretrained(model_name))

        self.tokenizer = load_tokenizer(model_name)
        self.pad = self.tokenizer.pad_token_id
        # token id used as a dummy prefix when detokenizing generated ids
        self.prefix_token_id = self.tokenizer("A")["input_ids"][0]

    def get_generation_config(self, request: GenerateRequest) -> GenerationConfig:
        """Build a GenerationConfig from the model defaults overridden by the request."""
        generation_config = copy.deepcopy(self.generation_config)
        request = dict(request)

        # drop None values and the non-generation fields before updating
        request_filtered = {}
        for key, value in request.items():
            if value is not None and key not in ["text", "remove_input_from_output"]:
                request_filtered[key] = value
request_filtered["return_dict_in_generate"] = True generation_config.update(**request_filtered) return generation_config def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exception]: try: batch_size = len(request.text) check_batch_size(batch_size, self.max_batch_size) input_tokens = self.tokenizer(request.text, return_tensors="pt", padding=True) max_input_length_in_batch = input_tokens.input_ids[0].shape[0] check_max_input_length(max_input_length_in_batch, self.max_input_length) for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = input_tokens[t].to(self.input_device) num_input_tokens = input_tokens["input_ids"].shape[1] generation_config = self.get_generation_config(request) output = self.model.generate(**input_tokens, generation_config=generation_config) output_tokens = output.sequences if self.is_encoder_decoder: num_generated_tokens = (output_tokens != self.pad).sum(dim=-1).tolist() generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) else: generated_tokens = output_tokens[:, num_input_tokens:] num_generated_tokens = (generated_tokens != self.pad).sum(dim=-1).tolist() if request.remove_input_from_output: # create the dummy prefix for detokenization prefix_to_add = torch.tensor([[self.prefix_token_id]] * batch_size).to(self.input_device) # the generate method's output includes input too. 
Remove input if # that is requested by the user generated_tokens = torch.cat([prefix_to_add, generated_tokens], dim=1) generated_text = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) generated_text = [i[1:] for i in generated_text] else: generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) return GenerateResponse( text=generated_text, num_generated_tokens=num_generated_tokens, is_encoder_decoder=self.is_encoder_decoder, ) except Exception as exception: return exception def forward(self, request: ForwardRequest) -> Union[ForwardResponse, Exception]: def prepare_tensors(conditioning_tokens: List[List[int]], response_tokens: List[List[int]]): bs = len(conditioning_tokens) input_ids = [conditioning_tokens[i] + response_tokens[i] for i in range(bs)] attention_mask = [[1] * (len(conditioning_tokens[i]) + len(response_tokens[i])) for i in range(bs)] labels = [[-100] * len(conditioning_tokens[i]) + response_tokens[i] for i in range(bs)] input_ids = pad(input_ids, self.tokenizer.pad_token_id) attention_mask = pad(attention_mask, 0) labels = pad(labels, -100) return { "input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "labels": torch.tensor(labels), } def pad(arrays: list, padding: int, max_length: int = None): if max_length is None: max_length = max(list(map(len, arrays))) arrays = [[padding] * (max_length - len(array)) + array for array in arrays] return arrays try: batch_size = len(request.conditioning_text) check_batch_size(batch_size, self.max_batch_size) conditioning_tokens = self.tokenizer(request.conditioning_text)["input_ids"] response_tokens = self.tokenizer(request.response)["input_ids"] max_length_in_batch = max([len(conditioning_tokens) + len(response_tokens)]) check_max_input_length(max_length_in_batch, self.max_input_length) input_tokens = prepare_tensors(conditioning_tokens, response_tokens) for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = 
input_tokens[t].to(self.input_device) loss = self.model(**input_tokens).loss return ForwardResponse(nll=loss.item(), is_encoder_decoder=self.is_encoder_decoder) except Exception as exception: return exception def tokenize(self, request: TokenizeRequest) -> TokenizeResponse: return TokenizeResponse( token_ids=self.tokenizer(request.text).input_ids, is_encoder_decoder=self.is_encoder_decoder, ) def check_max_input_length(input_token_length: int, max_input_length: int) -> None: if max_input_length is None: return if input_token_length > max_input_length: raise Exception(f"max supported input length = {max_input_length} for now") def check_batch_size(batch_size: int, max_batch_size: int) -> None: if max_batch_size is None: return if batch_size > max_batch_size: raise Exception(f"max supported batch size = {max_batch_size} for now") # this is a hack for now def get_hf_model_class(model_class: str) -> Union[AutoModelForCausalLM, AutoModelForSeq2SeqLM]: return getattr(transformers, model_class) def load_tokenizer(model_name: str) -> AutoTokenizer: tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") if tokenizer.pad_token_id is None: tokenizer.add_special_tokens({"pad_token": "[PAD]"}) return tokenizer ================================================ FILE: inference_server/server.py ================================================ import os from functools import partial from flask import Flask, request from flask_api import status from pydantic import BaseModel from .constants import HF_ACCELERATE from .model_handler.deployment import ModelDeployment from .utils import ( ForwardRequest, GenerateRequest, TokenizeRequest, get_exception_response, get_num_tokens_to_generate, get_torch_dtype, parse_bool, run_and_log_time, ) class QueryID(BaseModel): generate_query_id: int = 0 tokenize_query_id: int = 0 forward_query_id: int = 0 # placeholder class for getting args. 
gunicorn does not allow passing args to a
# python script via ArgumentParser
class Args:
    # read all server configuration from environment variables (gunicorn-friendly)
    def __init__(self) -> None:
        self.deployment_framework = os.getenv("DEPLOYMENT_FRAMEWORK", HF_ACCELERATE)
        self.model_name = os.getenv("MODEL_NAME")
        self.model_class = os.getenv("MODEL_CLASS")
        self.dtype = get_torch_dtype(os.getenv("DTYPE"))
        self.allowed_max_new_tokens = int(os.getenv("ALLOWED_MAX_NEW_TOKENS", 100))
        self.max_input_length = int(os.getenv("MAX_INPUT_LENGTH", 512))
        self.max_batch_size = int(os.getenv("MAX_BATCH_SIZE", 4))
        self.debug = parse_bool(os.getenv("DEBUG", "false"))


# ------------------------------------------------------
args = Args()
model = ModelDeployment(args, True)
query_ids = QueryID()
app = Flask(__name__)
# ------------------------------------------------------


@app.route("/query_id/", methods=["GET"])
def query_id():
    """Report the current per-endpoint request counters."""
    return query_ids.dict(), status.HTTP_200_OK


@app.route("/tokenize/", methods=["POST"])
def tokenize():
    """Tokenize the posted texts and return token ids with timing info."""
    try:
        x = request.get_json()
        x = TokenizeRequest(**x)
        response, total_time_taken = run_and_log_time(partial(model.tokenize, request=x))

        response.query_id = query_ids.tokenize_query_id
        # NOTE(review): these counter increments are not atomic — confirm the
        # server runs with a single worker/thread
        query_ids.tokenize_query_id += 1
        response.total_time_taken = "{:.2f} msecs".format(total_time_taken * 1000)

        return response.dict(), status.HTTP_200_OK
    except Exception:
        response = get_exception_response(query_ids.tokenize_query_id, args.debug)
        query_ids.tokenize_query_id += 1
        return response, status.HTTP_500_INTERNAL_SERVER_ERROR


@app.route("/generate/", methods=["POST"])
def generate():
    """Generate continuations for the posted texts."""
    try:
        x = request.get_json()
        x = GenerateRequest(**x)
        # clamp the requested token budget to the server-wide limit
        x.max_new_tokens = get_num_tokens_to_generate(x.max_new_tokens, args.allowed_max_new_tokens)

        response, total_time_taken = run_and_log_time(partial(model.generate, request=x))

        response.query_id = query_ids.generate_query_id
        query_ids.generate_query_id += 1
        response.total_time_taken = "{:.2f} secs".format(total_time_taken)

        return response.dict(), status.HTTP_200_OK
    except Exception:
        response = get_exception_response(query_ids.generate_query_id, args.debug)
        query_ids.generate_query_id += 1
        return response, status.HTTP_500_INTERNAL_SERVER_ERROR


@app.route("/forward/", methods=["POST"])
def forward():
    """Score each response against its conditioning text (negative log-likelihood)."""
    try:
        x = request.get_json()
        x = ForwardRequest(**x)

        if len(x.conditioning_text) != len(x.response):
            raise Exception("unequal number of elements in conditioning_text and response arguments")

        response, total_time_taken = run_and_log_time(partial(model.forward, request=x))

        response.query_id = query_ids.forward_query_id
        query_ids.forward_query_id += 1
        response.total_time_taken = "{:.2f} secs".format(total_time_taken)

        return response.dict(), status.HTTP_200_OK
    except Exception:
        response = get_exception_response(query_ids.forward_query_id, args.debug)
        query_ids.forward_query_id += 1
        return response, status.HTTP_500_INTERNAL_SERVER_ERROR


================================================
FILE: inference_server/utils/__init__.py
================================================
from .requests import (
    ForwardRequest,
    ForwardResponse,
    GenerateRequest,
    GenerateResponse,
    TokenizeRequest,
    TokenizeResponse,
    create_generate_request,
    get_filter_dict,
    parse_bool,
)
from .utils import (
    get_argument_parser,
    get_cuda_visible_devices,
    get_dummy_batch,
    get_exception_response,
    get_num_tokens_to_generate,
    get_str_dtype,
    get_torch_dtype,
    get_world_size,
    pad_ids,
    parse_args,
    print_rank_0,
    run_and_log_time,
    run_rank_n,
)


================================================
FILE: inference_server/utils/requests.py
================================================
from typing import Any, List

from pydantic import BaseModel


class BaseResponse(BaseModel):
    # common response envelope fields, filled in by the server routes
    query_id: int = None
    total_time_taken: str = None


class GenerateRequest(BaseModel):
    # generation request; all sampling knobs default to None (model defaults)
    text: List[str] = None
    min_length: int = None
    do_sample: bool = None
    early_stopping: bool = None
    temperature: float = None
    top_k: int = None
    top_p: float = None
    typical_p: float = None
    repetition_penalty: float = None
    bos_token_id: int = None
    pad_token_id: int =
None eos_token_id: int = None length_penalty: float = None no_repeat_ngram_size: int = None encoder_no_repeat_ngram_size: int = None max_time: float = None max_new_tokens: int = None decoder_start_token_id: int = None diversity_penalty: float = None forced_bos_token_id: int = None forced_eos_token_id: int = None exponential_decay_length_penalty: float = None remove_input_from_output: bool = True def get_generate_kwargs(self) -> dict: x = {} for k, v in self.dict().items(): if k not in ["text", "method"] and v is not None: x[k] = v return x class GenerateResponse(BaseResponse): text: List[str] = None num_generated_tokens: List[int] = None is_encoder_decoder: bool = False class TokenizeRequest(BaseModel): text: List[str] = None class TokenizeResponse(BaseResponse): token_ids: List[List[int]] = None is_encoder_decoder: bool = False class ForwardRequest(BaseModel): conditioning_text: List[str] = None response: List[str] = None class ForwardResponse(BaseResponse): nll: float = None is_encoder_decoder: bool = False def parse_bool(value: str) -> bool: if value.lower() == "true": return True elif value.lower() == "false": return False else: raise ValueError("{} is not a valid boolean value".format(value)) def parse_field(kwargs: dict, field: str, dtype: type, default_value: Any = None) -> Any: if field in kwargs: if type(kwargs[field]) == dtype: return kwargs[field] elif dtype == bool: return parse_bool(kwargs[field]) else: return dtype(kwargs[field]) else: return default_value def create_generate_request(text: List[str], generate_kwargs: dict) -> GenerateRequest: # get user generate_kwargs as json and parse it return GenerateRequest( text=text, min_length=parse_field(generate_kwargs, "min_length", int), do_sample=parse_field(generate_kwargs, "do_sample", bool), early_stopping=parse_field(generate_kwargs, "early_stopping", bool), temperature=parse_field(generate_kwargs, "temperature", float), top_k=parse_field(generate_kwargs, "top_k", int), 
top_p=parse_field(generate_kwargs, "top_p", float), typical_p=parse_field(generate_kwargs, "typical_p", float), repetition_penalty=parse_field(generate_kwargs, "repetition_penalty", float), bos_token_id=parse_field(generate_kwargs, "bos_token_id", int), pad_token_id=parse_field(generate_kwargs, "pad_token_id", int), eos_token_id=parse_field(generate_kwargs, "eos_token_id", int), length_penalty=parse_field(generate_kwargs, "length_penalty", float), no_repeat_ngram_size=parse_field(generate_kwargs, "no_repeat_ngram_size", int), encoder_no_repeat_ngram_size=parse_field(generate_kwargs, "encoder_no_repeat_ngram_size", int), max_time=parse_field(generate_kwargs, "max_time", float), max_new_tokens=parse_field(generate_kwargs, "max_new_tokens", int), decoder_start_token_id=parse_field(generate_kwargs, "decoder_start_token_id", int), diversity_penalty=parse_field(generate_kwargs, "diversity_penalty", float), forced_bos_token_id=parse_field(generate_kwargs, "forced_bos_token_id", int), forced_eos_token_id=parse_field(generate_kwargs, "forced_eos_token_id", int), exponential_decay_length_penalty=parse_field(generate_kwargs, "exponential_decay_length_penalty", float), remove_input_from_output=parse_field(generate_kwargs, "remove_input_from_output", bool, True), ) def get_filter_dict(d: BaseModel) -> dict: d = dict(d) q = {} for i in d: if d[i] != None: q[i] = d[i] del q["text"] return q ================================================ FILE: inference_server/utils/utils.py ================================================ import argparse import copy import json import math import os import sys import time import traceback from functools import partial from typing import Any, Callable, List, Tuple, Union import torch import torch.distributed as dist from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE, HF_CPU # used for benchmarks dummy_input_sentences = [ "DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all", "Everyone is happy and I 
# Sentences used to build dummy batches for benchmarking.
dummy_input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
    "Everyone is happy and I can",
    "The new movie that got Oscar this year",
    "In the far far distance from our galaxy,",
    "Peace is the only way",
]


def get_argument_parser() -> argparse.ArgumentParser:
    """Build the CLI parser shared by the server / benchmark / CLI entry points."""
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="model")
    group.add_argument(
        "--deployment_framework",
        type=str,
        choices=[HF_ACCELERATE, DS_INFERENCE, DS_ZERO, HF_CPU],
        default=HF_ACCELERATE,
    )
    group.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="model name to use",
    )
    group.add_argument(
        "--model_class",
        type=str,
        required=True,
        help="model class to use",
    )
    group.add_argument(
        "--dtype", type=str, required=True, choices=["bf16", "fp16", "int8", "fp32"], help="dtype for model"
    )
    group.add_argument(
        "--generate_kwargs",
        type=str,
        default='{"min_length": 100, "max_new_tokens": 100, "do_sample": false}',
        help="generate parameters. look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters",
    )
    group.add_argument("--max_input_length", type=int, help="max input length")
    group.add_argument("--max_batch_size", type=int, help="max supported batch size")

    return parser


def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
    """Parse CLI args, converting --dtype to a torch.dtype and --generate_kwargs to a dict."""
    args = parser.parse_args()
    args.dtype = get_torch_dtype(args.dtype)
    args.generate_kwargs = json.loads(args.generate_kwargs)
    return args


def run_rank_n(func: Callable, rank: int = 0, barrier: bool = False) -> None:
    """Restrict `func` to run only on distributed rank `rank`.

    Other ranks get a no-op stand-in (that still hits the barrier when asked),
    so all ranks can call the wrapped function symmetrically. When torch
    distributed is not initialized, `func` is returned unchanged.
    """

    # wrapper executed on the chosen rank
    def func_rank_n(*args, **kwargs):
        output = func(*args, **kwargs)
        if barrier:
            dist.barrier()
        return output

    # dummy method for every other rank
    def func_rank_other(*args, **kwargs):
        if barrier:
            dist.barrier()

    if dist.is_initialized():
        if dist.get_rank() == rank:
            return func_rank_n
        return func_rank_other
    else:
        return func


@run_rank_n
def print_rank_0(*args, **kwargs) -> None:
    """print() that only emits on rank 0 in a distributed run."""
    print(*args, **kwargs)


# dtype <-> string tables; kept in sync with the --dtype CLI choices.
_STR_TO_TORCH_DTYPE = {
    "bf16": torch.bfloat16,
    "fp16": torch.float16,
    "int8": torch.int8,
    "fp32": torch.float32,
}
_TORCH_DTYPE_TO_STR = {v: k for k, v in _STR_TO_TORCH_DTYPE.items()}


def get_torch_dtype(dtype_str: str) -> torch.dtype:
    """Map a dtype string ("bf16"/"fp16"/"int8"/"fp32") to the torch dtype.

    Returns None for unknown strings (same fall-through as the old if/elif chain).
    """
    return _STR_TO_TORCH_DTYPE.get(dtype_str)


def get_str_dtype(dtype_str: torch.dtype) -> str:
    """Inverse of get_torch_dtype: torch dtype -> short string, None if unknown."""
    return _TORCH_DTYPE_TO_STR.get(dtype_str)


def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]:
    """Return `batch_size` prompts, cycling the dummy sentences if needed.

    The default list is deep-copied so callers can't mutate the module global.
    """
    if input_sentences is None:
        input_sentences = copy.deepcopy(dummy_input_sentences)

    if batch_size > len(input_sentences):
        input_sentences *= math.ceil(batch_size / len(input_sentences))
    input_sentences = input_sentences[:batch_size]

    return input_sentences


def get_num_tokens_to_generate(max_new_tokens: int, allowed_max_new_tokens: int) -> int:
    """Clamp a requested token budget to the server-side maximum (None -> max)."""
    if max_new_tokens is None:
        return allowed_max_new_tokens
    else:
        return min(max_new_tokens, allowed_max_new_tokens)


def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]:
    """Run a partial (or list of partials) and return (results, elapsed seconds)."""
    start_time = time.time()

    if isinstance(execs, list):
        results = [f() for f in execs]
    else:
        results = execs()

    time_elapsed = time.time() - start_time
    return results, time_elapsed


def pad_ids(arrays, padding, max_length=-1):
    """Left-pad each id list with `padding` up to max_length (or the longest list)."""
    if max_length < 0:
        max_length = max(list(map(len, arrays)))

    arrays = [[padding] * (max_length - len(array)) + array for array in arrays]

    return arrays


def get_exception_response(query_id: int, debug: bool = False):
    """Build a JSON-safe error payload from the exception currently being handled.

    Must be called from inside an `except` block (reads sys.exc_info()).
    With debug=True the formatted stack trace is included.
    """
    e_type, e_message, e_stack_trace = sys.exc_info()
    response = {"error": str(e_type.__name__), "message": str(e_message), "query_id": query_id}

    if debug:
        trace_back = traceback.extract_tb(e_stack_trace)

        # Format stacktrace
        stack_trace = []
        for trace in trace_back:
            stack_trace.append(
                "File : {}, Line : {}, Func.Name : {}, Message : {}".format(trace[0], trace[1], trace[2], trace[3])
            )
        response["stack_trace"] = stack_trace

    return response


def get_world_size() -> int:
    """World size from torch.distributed if initialized, else #CUDA_VISIBLE_DEVICES (0 if unset)."""
    if dist.is_initialized():
        return dist.get_world_size()
    cuda_visible_devices = get_cuda_visible_devices()
    if cuda_visible_devices is None:
        return 0
    return len(cuda_visible_devices)


def get_cuda_visible_devices() -> List[int]:
    """CUDA_VISIBLE_DEVICES parsed as a list of ints, or None if the env var is unset."""
    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if cuda_visible_devices is not None:
        cuda_visible_devices = list(map(int, cuda_visible_devices.split(",")))
    return cuda_visible_devices
import argparse

import requests


def get_args() -> argparse.Namespace:
    """Read the target server's host and port from the command line."""
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="launch config")
    group.add_argument("--host", type=str, required=True, help="host address")
    group.add_argument("--port", type=int, required=True, help="port number")

    return parser.parse_args()


def _post_and_print(endpoint: str, payload: dict) -> None:
    """POST `payload` as JSON to `endpoint` and print the decoded reply."""
    reply = requests.post(url=endpoint, json=payload, verify=False)
    print(reply.json(), "\n")


def generate(url: str) -> None:
    """Exercise the /generate/ endpoint with a small prompt batch."""
    payload = {
        "text": [
            "DeepSpeed",
            "DeepSpeed is a",
            "DeepSpeed is a machine",
            "DeepSpeed is a machine learning framework",
        ],
        "max_new_tokens": 40,
    }
    _post_and_print(url + "/generate/", payload)


def tokenize(url: str) -> None:
    """Exercise the /tokenize/ endpoint."""
    payload = {"text": ["DeepSpeed is a", "DeepSpeed is a machine learning framework"]}
    _post_and_print(url + "/tokenize/", payload)


def forward(url: str) -> None:
    """Exercise the /forward/ endpoint with matching conditioning/response lists."""
    prompts = [
        "DeepSpeed",
        "DeepSpeed is a",
        "DeepSpeed is a machine",
        "DeepSpeed is a machine learning framework",
    ]
    payload = {
        "conditioning_text": prompts,
        "response": prompts,
    }
    _post_and_print(url + "/forward/", payload)


def query_id(url: str) -> None:
    """Fetch the server's current query-id counters."""
    reply = requests.get(url=url + "/query_id/", verify=False)
    print(reply.json(), "\n")


def main():
    """Smoke-test every endpoint of a running inference server."""
    args = get_args()
    url = "http://{}:{}".format(args.host, args.port)

    generate(url)
    tokenize(url)
    forward(url)
    query_id(url)


if __name__ == "__main__":
    main()
const top_kTextBox = document.getElementById('top_k-textbox');
const repetition_penaltySlider = document.getElementById('repetition_penalty-slider');
const repetition_penaltyTextBox = document.getElementById('repetition_penalty-textbox');
const max_new_tokensInput = document.getElementById('max-new-tokens-input');
const textLogOutput = document.getElementById('log-output');

// Slider getters, each paired with a listener that keeps its label in sync.
function get_temperature() {
    return parseFloat(temperatureSlider.value);
}

temperatureSlider.addEventListener('input', async (event) => {
    temperatureTextBox.innerHTML = "temperature = " + get_temperature();
});

function get_top_p() {
    return parseFloat(top_pSlider.value);
}

top_pSlider.addEventListener('input', async (event) => {
    top_pTextBox.innerHTML = "top_p = " + get_top_p();
});

function get_top_k() {
    return parseInt(top_kSlider.value);
}

top_kSlider.addEventListener('input', async (event) => {
    top_kTextBox.innerHTML = "top_k = " + get_top_k();
});

function get_repetition_penalty() {
    return parseFloat(repetition_penaltySlider.value);
}

repetition_penaltySlider.addEventListener('input', async (event) => {
    repetition_penaltyTextBox.innerHTML = "repetition_penalty = " + get_repetition_penalty();
});

function get_max_new_tokens() {
    return parseInt(max_new_tokensInput.value);
}

// Submit: collect the current controls into a payload, POST it to /generate/
// and render the result (or the error) into the log box.
clickButton.addEventListener('click', async (event) => {
    clickButton.textContent = 'Processing';
    clickButton.disabled = true;

    const payload = {
        text: [textGenInput.value],
        temperature: get_temperature(),
        top_k: get_top_k(),
        top_p: get_top_p(),
        max_new_tokens: get_max_new_tokens(),
        repetition_penalty: get_repetition_penalty(),
        do_sample: true,
        remove_input_from_output: true
    };

    // temperature 0 means greedy decoding — sampling must be off
    if (payload.temperature == 0) {
        payload.do_sample = false;
    }
    console.log(payload);

    $.ajax({
        url: '/generate/',
        type: 'POST',
        contentType: "application/json; charset=utf-8",
        data: JSON.stringify(payload),
        headers: { 'Access-Control-Allow-Origin': '*' },
        success: function (response) {
            const input_text = textGenInput.value;
            if ("text" in response) {
                if (response.is_encoder_decoder) {
                    // encoder-decoder output replaces the log, prompt box untouched
                    textLogOutput.value = response.text[0] + '\n\n';
                } else {
                    // decoder-only output is appended to the prompt box
                    textGenInput.value = input_text + response.text[0];
                    textLogOutput.value = '';
                }
                textLogOutput.value += 'total_time_taken = ' + response.total_time_taken + "\n";
                textLogOutput.value += 'num_generated_tokens = ' + response.num_generated_tokens + "\n";
                textLogOutput.style.backgroundColor = "lightblue";
            } else {
                textLogOutput.value = 'total_time_taken = ' + response.total_time_taken + "\n";
                textLogOutput.value += 'error: ' + response.message;
                textLogOutput.style.backgroundColor = "#D65235";
            }
            clickButton.textContent = 'Submit';
            clickButton.disabled = false;
        },
        error: function (error) {
            console.log(JSON.stringify(error, null, 2));
            clickButton.textContent = 'Submit';
            clickButton.disabled = false;
        }
    });
});
temperature = 1
top_k = 50
top_p = 1
max_new_tokens =
repetition_penalty = 1
import argparse

import requests
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.routing import APIRoute, Mount
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from transformers import AutoTokenizer
from uvicorn import run


def get_args() -> argparse.Namespace:
    """CLI options: where to serve the UI and where the generation backend lives."""
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="launch config")
    group.add_argument("--ui_host", type=str, default="127.0.0.1", help="host address for UI")
    group.add_argument("--ui_port", type=int, default=5001, help="port number for UI")
    group.add_argument(
        "--generation_backend_host", type=str, default="127.0.0.1", help="host address for generation server"
    )
    group.add_argument("--generation_backend_port", type=int, default=5000, help="port number for generation server")

    return parser.parse_args()


class Server:
    """Thin FastAPI front-end: serves the playground page and proxies /generate/
    requests to the generation backend."""

    # Upper bound (seconds) on a single proxied generation request; matches the
    # app-level timeout passed to FastAPI below.
    REQUEST_TIMEOUT = 600

    def __init__(self, args: argparse.Namespace):
        self.templates = Jinja2Templates(directory="templates")

        self.ui_host = args.ui_host
        self.ui_port = args.ui_port
        self.generation_backend_host = args.generation_backend_host
        self.generation_backend_port = args.generation_backend_port
        self.workers = 1
        # NOTE(review): this tokenizer is never used in this file — confirm it is
        # needed before removing; loading it downloads the BLOOM tokenizer files.
        self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")

        self.app = FastAPI(
            routes=[
                APIRoute("/", self.homepage, methods=["GET"], response_class=HTMLResponse),
                APIRoute("/generate/", self.generate, methods=["POST"]),
                Mount("/static/", StaticFiles(directory="static"), name="static"),
            ],
            timeout=600,
        )

        self.prefix_checkpoints_list = None

    def homepage(self, request: Request) -> HTMLResponse:
        """Render the playground page."""
        return self.templates.TemplateResponse("index.html", {"request": request})

    def generate(self, request: dict) -> JSONResponse:
        """Forward the raw JSON body to the backend's /generate endpoint.

        Fix: `requests.post` has no default timeout, so a hung backend would
        block this UI worker forever — bound the wait explicitly.
        """
        response = requests.post(
            f"http://{self.generation_backend_host}:{self.generation_backend_port}/generate",
            json=request,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        return JSONResponse(content=response.json())

    def run(self):
        """Attach permissive CORS middleware and start uvicorn."""
        # get around CORS
        self.app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
        )
        run(self.app, host=self.ui_host, port=self.ui_port, workers=self.workers)


def main() -> None:
    """Entry point: build the server from CLI args and serve forever."""
    Server(get_args()).run()


if __name__ == "__main__":
    main()