Repository: huggingface/transformers-bloom-inference Branch: main Commit: 62698bf4b75a Files: 39 Total size: 117.3 KB Directory structure: gitextract_c87qb2fj/ ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── bloom-inference-scripts/ │ ├── README.md │ ├── bloom-accelerate-inference.py │ ├── bloom-ds-inference.py │ └── bloom-ds-zero-inference.py ├── inference_server/ │ ├── benchmark.py │ ├── cli.py │ ├── constants.py │ ├── download_model.py │ ├── model_handler/ │ │ ├── __init__.py │ │ ├── deployment.py │ │ ├── grpc_utils/ │ │ │ ├── __init__.py │ │ │ ├── generation_server.py │ │ │ ├── pb/ │ │ │ │ ├── __init__.py │ │ │ │ ├── generation_pb2.py │ │ │ │ └── generation_pb2_grpc.py │ │ │ └── proto/ │ │ │ └── generation.proto │ │ └── launch.py │ ├── models/ │ │ ├── __init__.py │ │ ├── ds_inference.py │ │ ├── ds_zero.py │ │ ├── hf_accelerate.py │ │ ├── hf_cpu.py │ │ └── model.py │ ├── server.py │ └── utils/ │ ├── __init__.py │ ├── requests.py │ └── utils.py ├── server_request.py ├── setup.cfg ├── static/ │ ├── css/ │ │ └── style.css │ └── js/ │ └── index.js ├── templates/ │ └── index.html └── ui.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ __pycache__/ ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: - id: isort name: isort (python) - repo: https://github.com/psf/black rev: 23.1.0 hooks: - id: black args: [--line-length=119,--target-version=py35] ================================================ FILE: Dockerfile ================================================ FROM nvidia/cuda:11.6.1-devel-ubi8 as base RUN dnf install -y --disableplugin=subscription-manager make git && dnf clean all 
--disableplugin=subscription-manager # taken form pytorch's dockerfile RUN curl -L -o ./miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ chmod +x ./miniconda.sh && \ ./miniconda.sh -b -p /opt/conda && \ rm ./miniconda.sh ENV PYTHON_VERSION=3.9 \ PATH=/opt/conda/envs/inference/bin:/opt/conda/bin:${PATH} # create conda env RUN conda create -n inference python=${PYTHON_VERSION} pip -y # change shell to activate env SHELL ["conda", "run", "-n", "inference", "/bin/bash", "-c"] FROM base as conda # update conda RUN conda update -n base -c defaults conda -y # cmake RUN conda install -c anaconda cmake -y # necessary stuff RUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \ transformers==4.26.1 \ deepspeed==0.7.6 \ accelerate==0.16.0 \ gunicorn==20.1.0 \ flask \ flask_api \ fastapi==0.89.1 \ uvicorn==0.19.0 \ jinja2==3.1.2 \ pydantic==1.10.2 \ huggingface_hub==0.12.1 \ grpcio-tools==1.50.0 \ --no-cache-dir # clean conda env RUN conda clean -ya # change this as you like 🤗 ENV TRANSFORMERS_CACHE=/cos/HF_cache \ HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE} FROM conda as app WORKDIR /src RUN chmod -R g+w /src RUN mkdir /.cache && \ chmod -R g+w /.cache ENV PORT=5000 \ UI_PORT=5001 EXPOSE ${PORT} EXPOSE ${UI_PORT} CMD git clone https://github.com/huggingface/transformers-bloom-inference.git && \ cd transformers-bloom-inference && \ # install grpc and compile protos make gen-proto && \ make bloom-560m ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ gen-proto: mkdir -p inference_server/model_handler/grpc_utils/pb python -m grpc_tools.protoc -Iinference_server/model_handler/grpc_utils/proto --python_out=inference_server/model_handler/grpc_utils/pb --grpc_python_out=inference_server/model_handler/grpc_utils/pb inference_server/model_handler/grpc_utils/proto/generation.proto find inference_server/model_handler/grpc_utils/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . 
\1/g' {} \; touch inference_server/model_handler/grpc_utils/__init__.py touch inference_server/model_handler/grpc_utils/pb/__init__.py rm -rf inference_server/model_handler/grpc_utils/pb/*.py-e ui: python -m ui --ui_host 127.0.0.1 --ui_port 5001 --generation_backend_host 127.0.0.1 --generation_backend_port 5000 & # ------------------------- DS inference ------------------------- bloom-176b: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=bigscience/bloom \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=fp16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' # loads faster than the above one microsoft-bloom-176b: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=fp16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' bloomz-176b: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=bigscience/bloomz \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=fp16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' bloom-176b-int8: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=ds_inference \ DTYPE=int8 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s 
"%(r)s" %(s)s %(b)s' # ------------------------- HF accelerate ------------------------- bloom-560m: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=bigscience/bloom-560m \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=32 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' flan-t5-xxl: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=google/flan-t5-xxl \ MODEL_CLASS=AutoModelForSeq2SeqLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' ul2: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=google/ul2 \ MODEL_CLASS=AutoModelForSeq2SeqLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' codegen-mono: make ui TOKENIZERS_PARALLELISM=false \ MODEL_NAME=Salesforce/codegen-16B-mono \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=hf_accelerate \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=4 \ CUDA_VISIBLE_DEVICES=0 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' # ------------------------- HF CPU ------------------------- bloom-560m-cpu: make ui MODEL_NAME=bigscience/bloom-560m \ MODEL_CLASS=AutoModelForCausalLM \ DEPLOYMENT_FRAMEWORK=hf_cpu \ DTYPE=fp32 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=32 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' flan-t5-base-cpu: make ui 
MODEL_NAME=google/flan-t5-base \ MODEL_CLASS=AutoModelForSeq2SeqLM \ DEPLOYMENT_FRAMEWORK=hf_cpu \ DTYPE=bf16 \ MAX_INPUT_LENGTH=2048 \ MAX_BATCH_SIZE=32 \ gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s' ================================================ FILE: README.md ================================================ > [!NOTE] > This repository has been archived and is not being maintained any longer since a lot more efficient serving frameworks have been released recently like vLLM and TGI. # Fast Inference Solutions for BLOOM This repo provides demos and packages to perform fast inference solutions for BLOOM. Some of the solutions have their own repos in which case a link to the [corresponding repos](#Other-inference-solutions) is provided instead. # Inference solutions for BLOOM 176B We support HuggingFace accelerate and DeepSpeed Inference for generation. Install required packages: ```shell pip install flask flask_api gunicorn pydantic accelerate huggingface_hub>=0.9.0 deepspeed>=0.7.3 deepspeed-mii==0.0.2 ``` alternatively you can also install deepspeed from source: ```shell git clone https://github.com/microsoft/DeepSpeed cd DeepSpeed CFLAGS="-I$CONDA_PREFIX/include/" LDFLAGS="-L$CONDA_PREFIX/lib/" TORCH_CUDA_ARCH_LIST="7.0" DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check ``` All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B (fp16/bf16) and 4 A100 80GB GPUs for BLOOM 176B (int8). These scripts might not work for other models or a different number of GPUs. DS inference is deployed using logic borrowed from DeepSpeed MII library. Note: Sometimes GPU memory is not freed when DS inference deployment crashes. You can free this memory by running `killall python` in terminal. For using BLOOM quantized, use dtype = int8. 
Also, change the model_name to microsoft/bloom-deepspeed-inference-int8 for DeepSpeed-Inference. For HF accelerate, no change is needed for model_name. HF accelerate uses [LLM.int8()](https://arxiv.org/abs/2208.07339) and DS-inference uses [ZeroQuant](https://arxiv.org/abs/2206.01861) for post-training quantization. ## BLOOM inference via command-line This asks for generate_kwargs every time. Example: generate_kwargs = ```json {"min_length": 100, "max_new_tokens": 100, "do_sample": false} ``` 1. using HF accelerate ```shell python -m inference_server.cli --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` 2. using DS inference ```shell python -m inference_server.cli --model_name microsoft/bloom-deepspeed-inference-fp16 --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` ## BLOOM server deployment [make ](../Makefile) can be used to launch a generation server. Please note that the serving method is synchronous and users have to wait in queue until the preceding requests have been processed. An example to fire server requests is given [here](./server_request.py). Alternatively, a [Dockerfile](./Dockerfile) is also provided which launches a generation server on port 5000. An interactive UI can be launched via the following command to connect to the generation server. The default URL of the UI is `http://127.0.0.1:5001/`. The `model_name` is just used by the UI to check if the model is decoder or encoder-decoder model. ```shell python -m ui --model_name bigscience/bloom ``` This command launches the following UI to play with generation. Sorry for the crappy design. Unfortunately, my UI skills only go so far. 😅😅😅 ![image](assets/UI.png) ## Benchmark system for BLOOM inference 1. 
using HF accelerate ```shell python -m inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5 ``` 2. using DS inference ```shell deepspeed --num_gpus 8 --module inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5 ``` alternatively, to load model faster: ```shell deepspeed --num_gpus 8 --module inference_server.benchmark --model_name microsoft/bloom-deepspeed-inference-fp16 --model_class AutoModelForCausalLM --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5 ``` 3. using DS ZeRO ```shell deepspeed --num_gpus 8 --module inference_server.benchmark --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5 ``` # Support If you run into things not working or have other questions please open an Issue in the corresponding backend: - [Accelerate](https://github.com/huggingface/accelerate/issues) - [Deepspeed-Inference](https://github.com/microsoft/DeepSpeed/issues) - [Deepspeed-ZeRO](https://github.com/microsoft/DeepSpeed/issues) If there is a specific issue with one of the scripts and not the backend only then please open an Issue here and tag [@mayank31398](https://github.com/mayank31398). # Other inference solutions ## Client-side solutions Solutions developed to perform large batch inference locally: * [Custom HF Code](https://github.com/huggingface/transformers_bloom_parallel/). JAX: * [BLOOM Inference in JAX](https://github.com/huggingface/bloom-jax-inference) ## Server solutions A solution developed to be used in a server mode (i.e. varied batch size, varied request rate) can be found [here](https://github.com/Narsil/bloomserver). This is implemented in Rust. 
================================================ FILE: bloom-inference-scripts/README.md ================================================ # Inference scripts for BLOOM ## BLOOM Inference solutions Here are some benchmark resuls on JeanZay's 8x80GB A100 node w/ 512GB of CPU memory: All benchmarks are doing greedy generation of 100 token outputs: ``` Generate args {'max_length': 100, 'do_sample': False} ``` The input prompt is comprised of just a few tokens. Throughput in msecs on 8x80GB gpus: | project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | 256 | 512 | | :---------------- | :----- | :---- | :---- | :---- | :--- | :--- | :--- | :--- | | accelerate bf16 | 230.38 | 31.78 | 17.84 | 10.89 | oom | | | | | accelerate int8 | 286.56 | 40.92 | 22.65 | 13.27 | oom | | | | | ds-inference fp16 | 44.02 | 5.70 | 3.01 | 1.68 | 1.00 | 0.69 | oom | | | ds-inference int8 | 89.09 | 11.44 | 5.88 | 3.09 | 1.71 | 1.02 | 0.71 | oom | | ds-zero bf16 | 283 | 34.88 | oom | | | | | | note: Since Deepspeed-ZeRO can process multiple generate streams in parallel its throughput can be further divided by 8 or 16, depending on whether 8 or 16 gpus were used during the generate. and, of course, it means that it can process a bs of 64 in the case of 8x80 A100 (the table above). Start to ready to generate in secs (mainly loading and data preparation time): | project | | | :---------------------- | :--- | | accelerate | 121 | | ds-inference shard-int8 | 61 | | ds-inference shard-fp16 | 60 | | ds-inference unsharded | 662 | | ds-zero | 462 | Now let's look at the power of quantized int8-based models provided by [Deepspeed-Inference](https://www.deepspeed.ai/tutorials/inference-tutorial/) and [BitsNBytes](https://github.com/TimDettmers/bitsandbytes), as it requires only half the original GPU memory of inference in bfloat16 or float16. 
Throughput in msecs 4x80GB A100: | project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | | :---------------- | :----- | :---- | :---- | :---- | :--- | :--- | | accelerate int8 | 284.15 | 40.14 | 21.97 | oom | | | | ds-inference int8 | 156.51 | 20.11 | 10.38 | 5.50 | 2.96 | oom | To get the benchmark results simply add `--benchmark` to any of these 3 scripts discussed below. ## Deepspeed-Inference Deepspeed-Inference uses Tensor-Parallelism and efficient fused CUDA kernels: https://www.deepspeed.ai/tutorials/inference-tutorial/ ### Setup ``` pip install deepspeed>=0.7.3 ``` ### Run 1. the fastest approach is to use a tp-pre-sharded checkpoint that takes only ~1min to load, as compared to 10min for non-presharded bloom checkpoint ``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-fp16 ``` 1a. if you want to run the original bloom checkpoint, which once loaded will run at the same throughput as the previous solution, but the loading will take 10-20min: ``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name bigscience/bloom ``` 2a. The 8bit quantized version requires you to have only half the GPU memory of the normal half precision version: ``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-int8 --dtype int8 ``` Here we used `microsoft/bloom-deepspeed-inference-int8` and also told the script to run in `int8`. 
And of course, just 4x80GB A100 gpus is now sufficient: ``` deepspeed --num_gpus 4 bloom-inference-scripts/bloom-ds-inference.py --name microsoft/bloom-deepspeed-inference-int8 --dtype int8 ``` ## HF Accelerate HF Accelerate can use naive Pipeline Parallelism to load a huge model over multiple GPUs: https://github.com/huggingface/accelerate ### Setup ``` pip install transformers>=4.21.3 accelerate>=0.12.0 ``` ### Run ``` python bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-accelerate-inference_bs=1.txt ``` To activate the 8bit quantized solution first install `bitsnbytes`: ``` pip install bitsandbytes ``` and then add `--dtype int8` to the previous command line: ``` python bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --dtype int8 --batch_size 1 --benchmark 2>&1 | tee bloom-int8-accelerate-inference_bs=1.txt ``` if you have more than 4 GPUs you can tell it to use only 4 with: ``` CUDA_VISIBLE_DEVICES=0,1,2,3 python bloom-inference-scripts/bloom-accelerate-inference.py --name bigscience/bloom --dtype int8 --batch_size 1 --benchmark 2>&1 | tee bloom-int8-accelerate-inference_bs=1.txt ``` ## Deepspeed ZeRO-Inference [Deepspeed ZeRO](https://www.deepspeed.ai/tutorials/zero/) uses a magical sharding approach which can take almost any model and scale it across a few or hundreds of GPUs. ### Setup ``` pip install deepspeed ``` ### Run Note that the script currently runs the same inputs on all GPUs, but you can run a different stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference. 
``` deepspeed --num_gpus 8 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt ``` Please remember that with ZeRO the user can generate multiple unique streams at the same time - and thus the overall performance should be throughput in secs/token divided by number of participating gpus - so 8x to 16x faster depending on whether 8 or 16 gpus were used! You can also try the offloading solutions with just one small GPU, which will take a long time to run, but if you don't have 8 huge GPUs this is as good as it gets. CPU-Offload (1x gpus): ``` deepspeed --num_gpus 1 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --cpu_offload --benchmark 2>&1 | tee bloom-ds-zero-inference-cpu_offload_bs=8.txt ``` NVMe-Offload (1x gpus): ``` deepspeed --num_gpus 1 bloom-inference-scripts/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --nvme_offload_path=/path/to/nvme_offload --benchmark 2>&1 | tee bloom-ds-zero-inference-nvme_offload_bs=8.txt ``` make sure to adjust `/path/to/nvme_offload` to somewhere you have ~400GB of free memory on a fast NVMe drive. ## Support If you run into things not working or have other questions please open an Issue in the corresponding backend: - [Accelerate](https://github.com/huggingface/accelerate/issues) - [Deepspeed-Inference](https://github.com/microsoft/DeepSpeed/issues) - [Deepspeed-ZeRO](https://github.com/microsoft/DeepSpeed/issues) If there a specific issue with one of the scripts and not the backend only then please open an Issue here and tag [@stas00](https://github.com/stas00). 
================================================ FILE: bloom-inference-scripts/bloom-accelerate-inference.py ================================================ import argparse import gc import math import os import time import torch import torch.distributed as dist from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") parser.add_argument("--name", type=str, help="Name path", required=True) parser.add_argument("--batch_size", default=1, type=int, help="batch size") parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") parser.add_argument("--greedy", action="store_true") parser.add_argument("--top-k", type=int, default=0) parser.add_argument("--top-p", type=float, default=0.0) parser.add_argument("--dtype", type=str, help="float16 or int8", choices=["int8", "float16"], default="float16") return parser.parse_args() t_start = time.time() num_tokens = 100 args = get_args() local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = torch.cuda.device_count() rank = local_rank def print_rank0(*msg): if rank != 0: return print(*msg) print_rank0(f"Using {world_size} gpus") model_name = args.name print_rank0(f"Loading model {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) # XXX: can't automatically derive dtype via config's `from_pretrained` dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 # print(get_max_memory_per_gpu_dict()) infer_dtype = args.dtype if infer_dtype == "int8": dtype = torch.int8 kwargs = dict( device_map="auto", ) def get_world_size() -> int: if dist.is_initialized(): return dist.get_world_size() else: return 1 # balanced_low_0 - because it allows a larger batch size with multiple GPUs if get_world_size() > 1: kwargs["device_map"] = "balanced_low_0" if infer_dtype == "int8": 
print_rank0("Using `load_in_8bit=True` to use quanitized model") kwargs["load_in_8bit"] = True else: kwargs["torch_dtype"] = dtype model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) if args.benchmark: t_ready = time.time() ### Generate print_rank0(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") input_sentences = [ "DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all", "Everyone is happy and I can", "The new movie that got Oscar this year", "In the far far distance from our galaxy,", "Peace is the only way", ] if args.batch_size > len(input_sentences): # dynamically extend to support larger bs by repetition input_sentences *= math.ceil(args.batch_size / len(input_sentences)) generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) # generate_kwargs = dict(max_new_tokens=num_tokens, use_cache=False, do_sample=False) # generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) print_rank0(f"Generate args {generate_kwargs}") inputs = input_sentences[: args.batch_size] def generate(): """returns a list of zipped inputs, outputs and number of new tokens""" input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = input_tokens[t].to("cuda:0") outputs = model.generate(**input_tokens, **generate_kwargs) input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] output_tokens_lengths = [x.shape[0] for x in outputs] total_new_tokens = [o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths)] outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) return zip(inputs, outputs, total_new_tokens) print_rank0("*** Running generate") t_generate_start = time.time() generated = generate() t_generate_span = time.time() - t_generate_start for i, o, _ in generated: print_rank0(f"{'-'*60}\nin={i}\nout={o}\n") ### Benchmark if 
# Benchmark: re-run generate() several times and report per-token timing stats.
if args.benchmark:
    # clear cache / free memory so the benchmark starts from a clean slate
    torch.cuda.empty_cache()
    gc.collect()

    print_rank0("*** Running benchmark")

    # warm up: the first pass triggers lazy CUDA/optimization work, so it is
    # excluded from the timed section below
    for i in range(1):
        _ = generate()
    torch.cuda.synchronize()

    # benchmark: time `cycles` full generate() calls back to back
    t0 = time.time()
    cycles = 5
    total_new_tokens_generated = 0
    for i in range(cycles):
        generated = generate()
        total_new_tokens_generated += sum(new_tokens for _, _, new_tokens in generated)
    torch.cuda.synchronize()
    # NOTE(review): despite the name, this value is seconds-per-token (inverse
    # throughput); the printout below multiplies by 1000 to report msecs/token.
    throughput = (time.time() - t0) / (total_new_tokens_generated)
    print_rank0(
        f"""
*** Performance stats:
Throughput per token including tokenize: {throughput*1000:.2f} msecs
Start to ready to generate: {t_ready - t_start:.3f} secs
Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs
Start to finish: {t_ready - t_start + t_generate_span:.3f} secs
"""
    )
def get_repo_root(model_name_or_path):
    """Return the local cache directory that holds the model repo snapshot.

    Rank 0 downloads first while every other rank waits on the barrier, so
    the snapshot is fetched exactly once per node; afterwards all ranks
    resolve the (now warm) cache path.
    """
    if is_offline_mode():
        print_rank0("Offline mode: forcing local_files_only=True")

    download_kwargs = dict(
        local_files_only=is_offline_mode(),
        cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
        ignore_patterns=["*.safetensors"],
    )

    # only the first process actually downloads; the rest just hit the cache
    if rank == 0:
        snapshot_download(model_name_or_path, **download_kwargs)

    dist.barrier()

    return snapshot_download(model_name_or_path, **download_kwargs)
get_checkpoint_files(model_name_or_path): cached_repo_dir = get_repo_root(model_name_or_path) # extensions: .bin | .pt # creates a list of paths from all downloaded files in cache dir file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()] return file_list model_name = args.name infer_dtype = args.dtype tp_presharded_mode = True if model_name in tp_presharded_models else False # print(get_checkpoint_files(model_name)) print_rank0(f"*** Loading the model {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name) # XXX: can't automatically derive dtype via config's `from_pretrained` # dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 # use one of these args to `init_inference` # 1. injection_policy is the slower version, but it's plain pytorch so it'll always work # 2. replace_with_kernel_inject is the faster one (fast fused kernels) kernel_inject = True # kernel_inject = False if kernel_inject: # XXX: for now ds-inference only works with fp16 dtype = torch.float16 else: dtype = torch.bfloat16 if args.benchmark: torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage("pre-from-pretrained", force=True) # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load with deepspeed.OnDevice(dtype=dtype, device="meta"): model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) if args.benchmark: deepspeed.runtime.utils.see_memory_usage("post-from-pretrained", force=True) model = model.eval() if args.benchmark: torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage("post-init-ds-zero-init", force=True) ### Deepspeed-Inference Loading checkpoints_json = "checkpoints.json" def write_checkpoints_json(): checkpoint_files = get_checkpoint_files(model_name) if rank == 0: data = {"type": "BLOOM", "checkpoints": 
def write_checkpoints_json():
    """Write the DeepSpeed ``checkpoints.json`` index listing all BLOOM shards.

    Every rank must call get_checkpoint_files() — it contains a dist.barrier()
    and skipping it on non-zero ranks would deadlock — but only rank 0 writes
    the index file.

    Fix: the original ``json.dump(data, open(checkpoints_json, "w"))`` leaked
    the file handle (never explicitly closed/flushed); a context manager
    guarantees the file is closed before DeepSpeed reads it.
    """
    checkpoint_files = get_checkpoint_files(model_name)
    if rank == 0:
        data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
        with open(checkpoints_json, "w") as f:
            json.dump(data, f)
def generate():
    """Tokenize `inputs`, generate with the DeepSpeed-Inference engine, and
    return an iterator of (input_text, output_text, num_new_tokens) triples."""
    batch = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    # move all tensor fields to this rank's GPU
    device = torch.cuda.current_device()
    for name in batch:
        if torch.is_tensor(batch[name]):
            batch[name] = batch[name].to(device)

    sequences = model.generate(**batch, **generate_kwargs)

    # per-sequence new-token count = output length - prompt length
    in_lengths = [t.shape[0] for t in batch.input_ids]
    out_lengths = [t.shape[0] for t in sequences]
    new_tokens = [out - inp for inp, out in zip(in_lengths, out_lengths)]

    texts = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    return zip(inputs, texts, new_tokens)
if args.benchmark: print_rank0("*** Running benchmark") # warm up for i in range(1): _ = generate() torch.cuda.synchronize() # benchmark t0 = time.time() cycles = 5 total_new_tokens_generated = 0 for i in range(cycles): generated = generate() total_new_tokens_generated += sum(new_tokens for _, _, new_tokens in generated) torch.cuda.synchronize() throughput = (time.time() - t0) / (total_new_tokens_generated) print_rank0( f""" *** Performance stats: Throughput per token including tokenize: {throughput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """ ) ================================================ FILE: bloom-inference-scripts/bloom-ds-zero-inference.py ================================================ # usage: # deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom # # to run benchmarks: # deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --benchmark # # This is going to improve, but at the moment, the process is a bit cumbersome - we first use # 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, # 2. free the allocated storage # 3. start Deepspeed-Inference and only now load the checkpoint # 4. run generate # Done. 
def print_rank0(*msg):
    """Print only on the global rank-0 process to avoid duplicated output."""
    if rank == 0:
        print(*msg)
# --- offload configuration: at most one of CPU / NVMe param offload may be on ---
if args.cpu_offload and args.nvme_offload_path:
    raise ValueError("Use one of --cpu_offload or --nvme_offload_path and not both")

if args.cpu_offload:
    # park ZeRO-3 params in pinned host memory
    ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True)

if args.nvme_offload_path:
    # park ZeRO-3 params on NVMe storage at the given path
    ds_config["zero_optimization"]["offload_param"] = dict(
        device="nvme",
        pin_memory=True,
        nvme_path=args.nvme_offload_path,
        buffer_size=4e9,
    )

# keeping a live HfDeepSpeedConfig tells from_pretrained to instantiate
# the weights directly on GPUs (ZeRO-3 sharded) instead of CPU first
dschf = HfDeepSpeedConfig(ds_config)

if args.benchmark:
    torch.cuda.empty_cache()
    gc.collect()
    deepspeed.runtime.utils.see_memory_usage("pre-from-pretrained", force=True)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

if args.benchmark:
    deepspeed.runtime.utils.see_memory_usage("post-from-pretrained", force=True)

model = model.eval()

print_rank0(ds_config)

# wrap the model in a ZeRO-3 inference engine; [0] is the engine itself
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
ds_engine.module.eval()
model = ds_engine.module

if args.benchmark:
    # record the "model fully ready" timestamp for the perf report
    t_ready = time.time()
    deepspeed.runtime.utils.see_memory_usage("start-of-generate", force=True)
def generate():
    """Tokenize `inputs`, generate with the ZeRO-sharded model, and return an
    iterator of (input_text, output_text, num_new_tokens) triples."""
    tokenized = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    # ship every tensor field to this rank's GPU
    target = torch.cuda.current_device()
    for field in tokenized:
        if torch.is_tensor(tokenized[field]):
            tokenized[field] = tokenized[field].to(target)

    result_ids = model.generate(**tokenized, **generate_kwargs)

    # number of freshly generated tokens per sequence
    prompt_lens = [ids.shape[0] for ids in tokenized.input_ids]
    result_lens = [ids.shape[0] for ids in result_ids]
    fresh_counts = [r - p for p, r in zip(prompt_lens, result_lens)]

    result_texts = tokenizer.batch_decode(result_ids, skip_special_tokens=True)
    return zip(inputs, result_texts, fresh_counts)
def get_benchmark_results(
    benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int
) -> str:
    """Format a human-readable performance summary for a benchmark run.

    benchmark_time is the wall-clock time for all `cycles` generate calls;
    the same throughput is reported both as tokens/sec and msecs/token.
    """
    tokens_per_sec = total_new_tokens_generated / benchmark_time
    msecs_per_token = 1000 / tokens_per_sec
    secs_per_batch = benchmark_time / cycles
    return f"""
*** Performance stats:
Throughput (including tokenization) = {tokens_per_sec:.2f} tokens/sec
Throughput (including tokenization) = {msecs_per_token:.2f} msecs/token
Model loading time = {initialization_time:.2f} secs
Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size}
Latency = {secs_per_batch:.2f} secs
Model loading time + generation time per batch = {initialization_time + secs_per_batch:.2f} secs
"""
def benchmark_end_to_end(args: argparse.Namespace) -> None:
    """Load the model, run one warmup generation, then (optionally) a timed benchmark."""
    deployment, load_secs = run_and_log_time(partial(ModelDeployment, args=args, grpc_allowed=False))

    request = create_generate_request(get_dummy_batch(args.batch_size), args.generate_kwargs)

    print_rank_0(f"generate_kwargs = {args.generate_kwargs}")
    print_rank_0(f"batch_size = {args.batch_size}")

    # warmup is a must if measuring speed as it's when all the optimizations are performed
    # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs
    warmup_response = deployment.generate(request=request)

    for i, (o, _) in zip(request.text, zip(warmup_response.text, warmup_response.num_generated_tokens)):
        print_rank_0(f"{'-' * 60}\nin = {i}\nout = {o}\n")

    if args.benchmark_cycles <= 0:
        return

    print_rank_0("*** Running benchmark")

    torch.cuda.empty_cache()
    gc.collect()

    # one more warmup pass right before the timed section
    deployment.generate(request=request)
    torch.cuda.synchronize()

    # timed section
    generated_tokens, bench_secs = run_and_log_time(
        partial(benchmark_generation, model=deployment, request=request, cycles=args.benchmark_cycles)
    )

    # with ZeRO every GPU is generating batch_size * sequence_length tokens
    if args.deployment_framework == DS_ZERO:
        generated_tokens *= get_world_size()

    print_rank_0(get_benchmark_results(bench_secs, load_secs, generated_tokens, args.batch_size, args.benchmark_cycles))
def _prompt_for_generate_kwargs() -> dict:
    """Keep prompting until the user enters valid JSON generate kwargs."""
    while True:
        try:
            return json.loads(input("Generate kwargs: "))
        except Exception:
            e_type, e_message, _ = sys.exc_info()
            print("error =", e_type.__name__)
            print("message =", e_message)


def main() -> None:
    """Interactive loop: read a prompt, optionally change generation kwargs, generate."""
    args = get_args()
    model = ModelDeployment(args, True)
    generate_kwargs = args.generate_kwargs

    while True:
        input_text = input("Input text: ")

        if input("change generate_kwargs? [y/n] ") == "y":
            generate_kwargs = _prompt_for_generate_kwargs()

        response = model.generate(text=[input_text], generate_kwargs=generate_kwargs)

        print_rank_0("Output text:", response.text[0])
        print_rank_0("Generated tokens:", response.num_generated_tokens[0])
def main() -> None:
    """Pre-download the config, tokenizer, and weights so later launches hit the cache."""
    args = get_args()

    model_name = args.model_name
    print("downloading", model_name)

    # fetching each artifact populates the shared HF cache
    AutoConfig.from_pretrained(model_name)
    AutoTokenizer.from_pretrained(model_name)
    get_hf_model_class(args.model_class).from_pretrained(model_name)
should_use_grpc(self, deployment_framework: str, grpc_allowed: bool) -> bool: if grpc_allowed and get_world_size() > 1: return deployment_framework in [DS_INFERENCE, DS_ZERO] return False def initialize_ports(self): self.ports = [] for i in range(self.num_gpus): self.ports.append(50950 + self.cuda_visible_devices[i]) def _is_socket_open(self, port): import socket sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) result = sock.connect_ex(("0.0.0.0", port)) sock.close() return result == 0 def _is_server_process_alive(self): if self.process is None: return True try: self.process.wait(1) except subprocess.TimeoutExpired as err: # timeout means we're still running and all (probably) okay is_alive = True else: # no exception case is_alive = False return is_alive def _wait_until_server_is_live(self): sockets_open = False while not sockets_open: sockets_open = self._is_socket_open(self.ports[0]) process_alive = self._is_server_process_alive() if not process_alive: raise RuntimeError("server crashed for some reason, unable to proceed") time.sleep(4) print_rank_0("waiting for server to start...") print_rank_0(f"server has started on {self.ports[0]}") def dict_to_proto(self, generate_kwargs: dict) -> dict: result = {} for k, v in generate_kwargs.items(): if v is not None: x = generation_pb2.Value() setattr(x, self.dtype_proto_field[type(v)], v) result[k] = x return result def _initialize_service(self, args: argparse.Namespace): if self._is_socket_open(self.ports[0]): raise RuntimeError( f"Server is already running on port {self.ports}, please shutdown or use different port." 
        )

        # DeepSpeed frameworks run as a separate process group launched via the
        # `deepspeed` CLI; each rank serves one GRPC port from self.ports.
        if args.deployment_framework in [DS_INFERENCE, DS_ZERO]:
            ports = " ".join(map(str, self.ports))

            cmd = f"inference_server.model_handler.launch --model_name {args.model_name} --deployment_framework {args.deployment_framework} --dtype {get_str_dtype(args.dtype)} --port {ports} --model_class {args.model_class}"

            if args.max_batch_size is not None:
                cmd += f" --max_batch_size {args.max_batch_size}"
            if args.max_input_length is not None:
                cmd += f" --max_input_length {args.max_input_length}"

            # offset from DeepSpeed's default 29500 by the lowest visible GPU id
            # so that deployments on disjoint GPU sets don't collide on the port
            master_port = 29500 + min(self.cuda_visible_devices)

            cuda_visible_devices = ",".join(map(str, self.cuda_visible_devices))

            cmd = f"deepspeed --master_port {master_port} --include localhost:{cuda_visible_devices} --module {cmd}"
        else:
            raise NotImplementedError(f"unsupported deployment_framework: {args.deployment_framework}")

        cmd = cmd.split(" ")

        self.process = subprocess.Popen(cmd)

    def _initialize_grpc_client(self):
        """Create one async GRPC stub per tensor-parallel rank (one port each)."""
        self.stubs = []
        for i in self.ports:
            channel = grpc.aio.insecure_channel(f"localhost:{i}")
            stub = generation_pb2_grpc.GenerationServiceStub(channel)
            self.stubs.append(stub)

    # runs the task on all ranks in parallel and returns the first rank's task;
    # all tensor-parallel ranks produce identical output
    async def generate_in_tensor_parallel(self, text: List[str], generate_kwargs: dict):
        responses = []
        for i in range(self.num_gpus):
            responses.append(self.asyncio_loop.create_task(self.generate_async(i, text, generate_kwargs)))

        # NOTE(review): only rank 0's task is awaited before returning — the
        # caller reads .result() from it; confirm the remaining tasks complete
        await responses[0]
        return responses[0]

    async def generate_async(self, stub_id: int, text: List[str], generate_kwargs: dict):
        """Send one Generate RPC to the rank identified by stub_id."""
        req = generation_pb2.GenerationRequestProto(texts=text, generate_kwargs=generate_kwargs)
        response = await self.stubs[stub_id].Generate(req)
        return response

    # runs the task on all ranks in parallel and returns the first rank's task;
    # all tensor-parallel ranks produce identical output
    async def forward_in_tensor_parallel(self, conditioning_text: List[str], response: List[str]):
        responses = []
        for i in range(self.num_gpus):
            responses.append(self.asyncio_loop.create_task(self.forward_async(i, conditioning_text, response)))

        await responses[0]
        return responses[0]

    async def forward_async(self, stub_id: int, conditioning_text: List[str], response: List[str]):
        """Send one Forward RPC to the rank identified by stub_id."""
        req = generation_pb2.ForwardRequestProto(conditioning_text=conditioning_text, response=response)
        response = await self.stubs[stub_id].Forward(req)
        return response

    def generate(self, **kwargs) -> GenerateResponse:
        """Generate text via the GRPC workers or the in-process model.

        Accepts either `request=GenerateRequest` or raw `text` + `generate_kwargs`.
        """
        if self.use_grpc_server:
            if "request" in kwargs:
                text = kwargs["request"].text
                generate_kwargs = kwargs["request"].get_generate_kwargs()
            else:
                text = kwargs["text"]
                generate_kwargs = kwargs["generate_kwargs"]

            generate_kwargs = self.dict_to_proto(generate_kwargs)

            response = self.asyncio_loop.run_until_complete(
                self.generate_in_tensor_parallel(text, generate_kwargs)
            ).result()

            if response.error:
                raise Exception(response.error)
            else:
                return GenerateResponse(
                    text=[r for r in response.texts], num_generated_tokens=[n for n in response.num_generated_tokens]
                )
        else:
            if "request" in kwargs:
                request = kwargs["request"]
            else:
                request = create_generate_request(**kwargs)

            response = self.model.generate(request)

            # in-process models return exceptions instead of raising them
            if isinstance(response, Exception):
                raise response
            else:
                return response

    def forward(self, request: ForwardRequest) -> ForwardResponse:
        """Score `response` given `conditioning_text` via GRPC workers or in process."""
        if self.use_grpc_server:
            response = self.asyncio_loop.run_until_complete(
                self.forward_in_tensor_parallel(request.conditioning_text, request.response)
            ).result()

            if response.error:
                raise Exception(response.error)
            else:
                return ForwardResponse(nll=response.nll)
        else:
            response = self.model.forward(request)

            if isinstance(response, Exception):
                raise response
            else:
                return response

    def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:
        """Tokenize with the local tokenizer (GRPC mode) or delegate to the model."""
        if self.use_grpc_server:
            # NOTE(review): TokenizeRequest (utils/requests.py) declares no
            # `padding` field and TokenizeResponse no `attention_mask` field —
            # confirm these attributes exist at runtime
            response = self.tokenizer(request.text, padding=request.padding)
            response = TokenizeResponse(token_ids=response.input_ids, attention_mask=response.attention_mask)
        else:
            response = self.model.tokenize(request)

        return response


================================================
FILE: inference_server/model_handler/grpc_utils/__init__.py
================================================ ================================================ FILE: inference_server/model_handler/grpc_utils/generation_server.py ================================================ import os from concurrent import futures import torch import grpc # from ...constants import GRPC_MAX_MSG_SIZE from ...models import Model from ...utils import ForwardRequest, TokenizeRequest, create_generate_request, print_rank_0 from .pb import generation_pb2, generation_pb2_grpc class GenerationServer(generation_pb2_grpc.GenerationServiceServicer): def __init__(self, model: Model) -> None: self.model = model def _unpack_proto_query_kwargs(self, query_kwargs): query_kwargs = {k: getattr(v, v.WhichOneof("oneof_values")) for k, v in query_kwargs.items()} return query_kwargs def Generate(self, request, context): text = [r for r in request.texts] generate_kwargs = self._unpack_proto_query_kwargs(request.generate_kwargs) request = create_generate_request(text=text, generate_kwargs=generate_kwargs) local_rank = int(os.getenv("LOCAL_RANK", "0")) torch.cuda.set_device(local_rank) self.model.input_device = local_rank response = self.model.generate(request) if isinstance(response, Exception): # if exception occurs, we don't this subprocess to crash response = generation_pb2.GenerationResponseProto( error=str(response), is_encoder_decoder=response.is_encoder_decoder ) else: response = generation_pb2.GenerationResponseProto( texts=response.text, num_generated_tokens=response.num_generated_tokens, is_encoder_decoder=response.is_encoder_decoder, ) return response def Forward(self, request, context): conditioning_text = [r for r in request.conditioning_text] response = [r for r in request.response] request = ForwardRequest(conditioning_text=conditioning_text, response=response) local_rank = int(os.getenv("LOCAL_RANK", "0")) torch.cuda.set_device(local_rank) self.model.input_device = local_rank response = self.model.forward(request) if isinstance(response, 
Exception): # if exception occurs, we don't this subprocess to crash response = generation_pb2.ForwardResponseProto( error=str(response), is_encoder_decoder=response.is_encoder_decoder ) else: response = generation_pb2.ForwardResponseProto( nll=response.nll, is_encoder_decoder=response.is_encoder_decoder ) return response def serve(inference_pipeline, port): server = grpc.server( futures.ThreadPoolExecutor(max_workers=1), # options=[ # ("grpc.max_send_message_length", GRPC_MAX_MSG_SIZE), # ("grpc.max_receive_message_length", GRPC_MAX_MSG_SIZE), # ], ) generation_pb2_grpc.add_GenerationServiceServicer_to_server(GenerationServer(inference_pipeline), server) server.add_insecure_port(f"[::]:{port}") print_rank_0("About to start server") server.start() print_rank_0("Started") server.wait_for_termination() ================================================ FILE: inference_server/model_handler/grpc_utils/pb/__init__.py ================================================ ================================================ FILE: inference_server/model_handler/grpc_utils/pb/generation_pb2.py ================================================ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: generation.proto """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( b'\n\x10generation.proto\x12\ngeneration"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values"\xc2\x01\n\x16GenerationRequestProto\x12\r\n\x05texts\x18\x01 \x03(\t\x12O\n\x0fgenerate_kwargs\x18\x02 \x03(\x0b\x32\x36.generation.GenerationRequestProto.GenerateKwargsEntry\x1aH\n\x13GenerateKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12 \n\x05value\x18\x02 \x01(\x0b\x32\x11.generation.Value:\x02\x38\x01"q\n\x17GenerationResponseProto\x12\r\n\x05texts\x18\x01 \x03(\t\x12\x1c\n\x14num_generated_tokens\x18\x02 \x03(\x05\x12\r\n\x05\x65rror\x18\x03 \x01(\t\x12\x1a\n\x12is_encoder_decoder\x18\x04 \x01(\x08"B\n\x13\x46orwardRequestProto\x12\x19\n\x11\x63onditioning_text\x18\x01 \x03(\t\x12\x10\n\x08response\x18\x02 \x03(\t"N\n\x14\x46orwardResponseProto\x12\x0b\n\x03nll\x18\x01 \x01(\x02\x12\r\n\x05\x65rror\x18\x02 \x01(\t\x12\x1a\n\x12is_encoder_decoder\x18\x03 \x01(\x08\x32\xba\x01\n\x11GenerationService\x12U\n\x08Generate\x12".generation.GenerationRequestProto\x1a#.generation.GenerationResponseProto"\x00\x12N\n\x07\x46orward\x12\x1f.generation.ForwardRequestProto\x1a .generation.ForwardResponseProto"\x00\x62\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "generation_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._options 
= None _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_options = b"8\001" _VALUE._serialized_start = 32 _VALUE._serialized_end = 127 _GENERATIONREQUESTPROTO._serialized_start = 130 _GENERATIONREQUESTPROTO._serialized_end = 324 _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_start = 252 _GENERATIONREQUESTPROTO_GENERATEKWARGSENTRY._serialized_end = 324 _GENERATIONRESPONSEPROTO._serialized_start = 326 _GENERATIONRESPONSEPROTO._serialized_end = 439 _FORWARDREQUESTPROTO._serialized_start = 441 _FORWARDREQUESTPROTO._serialized_end = 507 _FORWARDRESPONSEPROTO._serialized_start = 509 _FORWARDRESPONSEPROTO._serialized_end = 587 _GENERATIONSERVICE._serialized_start = 590 _GENERATIONSERVICE._serialized_end = 776 # @@protoc_insertion_point(module_scope) ================================================ FILE: inference_server/model_handler/grpc_utils/pb/generation_pb2_grpc.py ================================================ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc from . import generation_pb2 as generation__pb2 class GenerationServiceStub(object): """Missing associated documentation comment in .proto file.""" def __init__(self, channel): """Constructor. Args: channel: A grpc.Channel. 
""" self.Generate = channel.unary_unary( "/generation.GenerationService/Generate", request_serializer=generation__pb2.GenerationRequestProto.SerializeToString, response_deserializer=generation__pb2.GenerationResponseProto.FromString, ) self.Forward = channel.unary_unary( "/generation.GenerationService/Forward", request_serializer=generation__pb2.ForwardRequestProto.SerializeToString, response_deserializer=generation__pb2.ForwardResponseProto.FromString, ) class GenerationServiceServicer(object): """Missing associated documentation comment in .proto file.""" def Generate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def Forward(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def add_GenerationServiceServicer_to_server(servicer, server): rpc_method_handlers = { "Generate": grpc.unary_unary_rpc_method_handler( servicer.Generate, request_deserializer=generation__pb2.GenerationRequestProto.FromString, response_serializer=generation__pb2.GenerationResponseProto.SerializeToString, ), "Forward": grpc.unary_unary_rpc_method_handler( servicer.Forward, request_deserializer=generation__pb2.ForwardRequestProto.FromString, response_serializer=generation__pb2.ForwardResponseProto.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler("generation.GenerationService", rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) # This class is part of an EXPERIMENTAL API. 
class GenerationService(object): """Missing associated documentation comment in .proto file.""" @staticmethod def Generate( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_unary( request, target, "/generation.GenerationService/Generate", generation__pb2.GenerationRequestProto.SerializeToString, generation__pb2.GenerationResponseProto.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, ) @staticmethod def Forward( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_unary( request, target, "/generation.GenerationService/Forward", generation__pb2.ForwardRequestProto.SerializeToString, generation__pb2.ForwardResponseProto.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, ) ================================================ FILE: inference_server/model_handler/grpc_utils/proto/generation.proto ================================================ syntax = "proto3"; package generation; service GenerationService { rpc Generate (GenerationRequestProto) returns (GenerationResponseProto) {} rpc Forward (ForwardRequestProto) returns (ForwardResponseProto) {} } message Value { oneof oneof_values { string svalue = 1; int64 ivalue = 2; float fvalue = 3; bool bvalue = 4; } } message GenerationRequestProto { repeated string texts = 1; map generate_kwargs = 2; } message GenerationResponseProto { repeated string texts = 1; repeated int32 num_generated_tokens = 2; string error = 3; bool is_encoder_decoder = 4; } message ForwardRequestProto { repeated string conditioning_text = 1; repeated string response = 2; } message ForwardResponseProto { float nll = 1; string error = 
2;
  bool is_encoder_decoder = 3;
}

================================================
FILE: inference_server/model_handler/launch.py
================================================
"""
Copyright 2022 The Microsoft DeepSpeed Team
"""
import argparse

import torch.distributed as dist

from ..models import get_model_class, start_inference_engine
from ..utils import get_argument_parser, parse_args
from .grpc_utils.generation_server import serve


def get_args() -> argparse.Namespace:
    """Parse launch-specific CLI arguments on top of the shared argument parser."""
    parser = get_argument_parser()

    group = parser.add_argument_group(title="launch config")
    group.add_argument("--local_rank", required=False, type=int, help="used by dist launchers")
    group.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload for DS ZeRO")
    # NOTE(review): deployment.py builds the command with `--port ...`; argparse
    # prefix matching resolves that to `--ports`, but it is fragile — confirm
    group.add_argument("--ports", nargs="+", help="GRPC ports")

    args = parse_args(parser)

    return args


def main():
    """Entry point for one tensor-parallel rank: init engine, load model, serve GRPC."""
    args = get_args()

    start_inference_engine(args.deployment_framework)

    model = get_model_class(args.deployment_framework)(args)

    # each rank listens on its own port, indexed by its distributed rank
    serve(model, args.ports[dist.get_rank()])


if __name__ == "__main__":
    main()


================================================
FILE: inference_server/models/__init__.py
================================================
from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE, HF_CPU
from .model import Model, get_hf_model_class, load_tokenizer


def get_model_class(deployment_framework: str):
    """Map a deployment-framework name to its Model subclass (lazily imported)."""
    if deployment_framework == HF_ACCELERATE:
        from .hf_accelerate import HFAccelerateModel

        return HFAccelerateModel
    elif deployment_framework == HF_CPU:
        from .hf_cpu import HFCPUModel

        return HFCPUModel
    elif deployment_framework == DS_INFERENCE:
        from .ds_inference import DSInferenceModel

        return DSInferenceModel
    elif deployment_framework == DS_ZERO:
        from .ds_zero import DSZeROModel

        return DSZeROModel
    else:
        raise ValueError(f"Unknown deployment framework {deployment_framework}")


def start_inference_engine(deployment_framework: str) -> None:
    """Initialize the distributed backend for DeepSpeed-based frameworks."""
    if deployment_framework in [DS_INFERENCE, DS_ZERO]:
        import
deepspeed deepspeed.init_distributed("nccl") ================================================ FILE: inference_server/models/ds_inference.py ================================================ import glob import io import json import os from argparse import Namespace from functools import partial import torch import deepspeed from huggingface_hub import try_to_load_from_cache from transformers import AutoConfig from ..utils import get_world_size, run_rank_n from .model import Model, get_hf_model_class # basic DeepSpeed inference model class for benchmarking class DSInferenceModel(Model): def __init__(self, args: Namespace) -> None: super().__init__(args) # create dummy tensors for allocating space which will be filled with # the actual weights while calling deepspeed.init_inference in the # following code with deepspeed.OnDevice(dtype=torch.float16, device="meta"): self.model = get_hf_model_class(args.model_class).from_config( AutoConfig.from_pretrained(args.model_name), torch_dtype=torch.bfloat16 ) self.model = self.model.eval() downloaded_model_path = get_model_path(args.model_name) if args.dtype in [torch.float16, torch.int8]: # We currently support the weights provided by microsoft (which are # pre-sharded) checkpoints_json = os.path.join(downloaded_model_path, "ds_inference_config.json") if os.path.isfile(checkpoints_json): self.model = deepspeed.init_inference( self.model, mp_size=get_world_size(), base_dir=downloaded_model_path, dtype=args.dtype, checkpoint=checkpoints_json, replace_with_kernel_inject=True, ) else: # for bigscience/bloom, sharding is done while loading the model # so this is much slower and for this we need to create a # checkpoints json with TemporaryCheckpointsJSON(downloaded_model_path) as checkpoints_json: self.model = deepspeed.init_inference( self.model, mp_size=get_world_size(), base_dir=downloaded_model_path, dtype=args.dtype, checkpoint=checkpoints_json, replace_with_kernel_inject=True, ) elif args.dtype == torch.bfloat16: # currently 
ds-inference only supports fp16 CUDA kernels :( raise NotImplementedError("bfloat16 is not yet supported") self.model = self.model.module self.input_device = torch.cuda.current_device() self.post_init(args.model_name) class TemporaryCheckpointsJSON: def __init__(self, model_path: str): self.tmp_directory = "tmp" self.tmp_file = os.path.join(self.tmp_directory, "checkpoints.json") self.model_path = model_path def write_checkpoints_json(self) -> None: print(self.model_path) with io.open(self.tmp_file, "w", encoding="utf-8") as f: data = {"type": "BLOOM", "checkpoints": glob.glob(f"{self.model_path}/*.bin"), "version": 1.0} json.dump(data, f) def __enter__(self): run_rank_n(os.makedirs, barrier=True)(self.tmp_directory, exist_ok=True) run_rank_n(self.write_checkpoints_json, barrier=True)() return self.tmp_file def __exit__(self, type, value, traceback): return def get_model_path(model_name: str): try: config_file = "config.json" # will fall back to HUGGINGFACE_HUB_CACHE config_path = try_to_load_from_cache(model_name, config_file, cache_dir=os.getenv("TRANSFORMERS_CACHE")) if config_path is None: # treat the model name as an explicit model path return model_name else: return os.path.dirname(config_path) except: # treat the model name as an explicit model path return model_name ================================================ FILE: inference_server/models/ds_zero.py ================================================ from argparse import Namespace import torch import deepspeed from transformers import AutoConfig from transformers.deepspeed import HfDeepSpeedConfig from ..utils import get_world_size from .model import Model, get_hf_model_class class DSZeROModel(Model): def __init__(self, args: Namespace) -> None: super().__init__(args) config = AutoConfig.from_pretrained(args.model_name) train_micro_batch_size_per_gpu = 1 train_batch_size = train_micro_batch_size_per_gpu * get_world_size() # try playing with these parameters, might improve throughput for you # hardware 
setup
        ds_config = {
            "fp16": {
                "enabled": args.dtype == torch.float16,
            },
            "bf16": {
                "enabled": args.dtype == torch.bfloat16,
            },
            "zero_optimization": {
                "stage": 3,
                "overlap_comm": True,
                "contiguous_gradients": True,
                "reduce_bucket_size": config.hidden_size * config.hidden_size,
                "stage3_prefetch_bucket_size": 0.9 * config.hidden_size * config.hidden_size,
                "stage3_param_persistence_threshold": 0,
            },
            "steps_per_print": 2000,
            "train_batch_size": train_batch_size,
            "train_micro_batch_size_per_gpu": train_micro_batch_size_per_gpu,
            "wall_clock_breakdown": False,
        }

        if args.cpu_offload:
            ds_config["zero_optimization"]["offload_param"] = {"device": "cpu", "pin_memory": True}

        # this tells from_pretrained to instantiate directly on gpus
        dschf = HfDeepSpeedConfig(ds_config)

        self.model = get_hf_model_class(args.model_class).from_pretrained(args.model_name, torch_dtype=args.dtype)
        self.model = self.model.eval()

        # convert model to a fully sharded model using ZeRO
        self.model = deepspeed.initialize(model=self.model, config_params=ds_config)[0]

        self.model.module.eval()
        self.model = self.model.module

        # this is the CUDA device for the current process. This will be used
        # later to identify the GPU on which to transfer tensors
        self.input_device = torch.cuda.current_device()

        self.post_init(args.model_name)


================================================
FILE: inference_server/models/hf_accelerate.py
================================================
from argparse import Namespace

import torch

from ..utils import get_world_size
from .model import Model, get_hf_model_class


class HFAccelerateModel(Model):
    def __init__(self, args: Namespace) -> None:
        """Load the model with accelerate's automatic device placement."""
        super().__init__(args)

        kwargs = {"pretrained_model_name_or_path": args.model_name, "device_map": "auto"}

        if get_world_size() > 1:
            # keep GPU 0 lighter since it also gathers generation outputs
            kwargs["device_map"] = "balanced_low_0"

        if args.dtype == torch.int8:
            # using LLM.int8()
            kwargs["load_in_8bit"] = True
        else:
            kwargs["torch_dtype"] = args.dtype

        # this is the CUDA device for the current process. This will be used
        # later to identify the GPU on which to transfer tensors
        self.model = get_hf_model_class(args.model_class).from_pretrained(**kwargs)

        self.model.requires_grad_(False)
        self.model.eval()
        self.input_device = "cuda:0"

        self.post_init(args.model_name)


================================================
FILE: inference_server/models/hf_cpu.py
================================================
from argparse import Namespace

from .hf_accelerate import HFAccelerateModel


class HFCPUModel(HFAccelerateModel):
    # NOTE(review): super().__init__ still builds the model with a GPU
    # device_map before input_device is overridden to "cpu" — confirm
    def __init__(self, args: Namespace) -> None:
        super().__init__(args)
        self.input_device = "cpu"


================================================
FILE: inference_server/models/model.py
================================================
import argparse
import copy
from typing import List, Union

import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

from ..utils import (
    ForwardRequest,
    ForwardResponse,
    GenerateRequest,
    GenerateResponse,
    TokenizeRequest,
    TokenizeResponse,
)


class Model:
    # base class shared by all deployment backends; subclasses set self.model
    # and self.input_device, then call post_init
    def __init__(self, args: argparse.Namespace) -> None:
        self.model = None
        self.input_device = None
        self.max_input_length = args.max_input_length
        self.max_batch_size = args.max_batch_size

    def post_init(self, model_name: str) -> None:
        """Finish setup once weights are loaded: config flags, tokenizer, pad ids."""
        self.is_encoder_decoder = AutoConfig.from_pretrained(model_name).is_encoder_decoder
        self.generation_config = GenerationConfig.from_model_config(AutoConfig.from_pretrained(model_name))

        self.tokenizer = load_tokenizer(model_name)
        self.pad = self.tokenizer.pad_token_id
        # token id used as a dummy prefix when detokenizing generated ids
        self.prefix_token_id = self.tokenizer("A")["input_ids"][0]

    def get_generation_config(self, request: GenerateRequest) -> GenerationConfig:
        """Build a GenerationConfig from the model defaults overridden by the request."""
        generation_config = copy.deepcopy(self.generation_config)
        request = dict(request)

        # drop None values and the non-generation fields before updating
        request_filtered = {}
        for key, value in request.items():
            if value is not None and key not in ["text", "remove_input_from_output"]:
                request_filtered[key] = value
request_filtered["return_dict_in_generate"] = True generation_config.update(**request_filtered) return generation_config def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exception]: try: batch_size = len(request.text) check_batch_size(batch_size, self.max_batch_size) input_tokens = self.tokenizer(request.text, return_tensors="pt", padding=True) max_input_length_in_batch = input_tokens.input_ids[0].shape[0] check_max_input_length(max_input_length_in_batch, self.max_input_length) for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = input_tokens[t].to(self.input_device) num_input_tokens = input_tokens["input_ids"].shape[1] generation_config = self.get_generation_config(request) output = self.model.generate(**input_tokens, generation_config=generation_config) output_tokens = output.sequences if self.is_encoder_decoder: num_generated_tokens = (output_tokens != self.pad).sum(dim=-1).tolist() generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) else: generated_tokens = output_tokens[:, num_input_tokens:] num_generated_tokens = (generated_tokens != self.pad).sum(dim=-1).tolist() if request.remove_input_from_output: # create the dummy prefix for detokenization prefix_to_add = torch.tensor([[self.prefix_token_id]] * batch_size).to(self.input_device) # the generate method's output includes input too. 
Remove input if # that is requested by the user generated_tokens = torch.cat([prefix_to_add, generated_tokens], dim=1) generated_text = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) generated_text = [i[1:] for i in generated_text] else: generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) return GenerateResponse( text=generated_text, num_generated_tokens=num_generated_tokens, is_encoder_decoder=self.is_encoder_decoder, ) except Exception as exception: return exception def forward(self, request: ForwardRequest) -> Union[ForwardResponse, Exception]: def prepare_tensors(conditioning_tokens: List[List[int]], response_tokens: List[List[int]]): bs = len(conditioning_tokens) input_ids = [conditioning_tokens[i] + response_tokens[i] for i in range(bs)] attention_mask = [[1] * (len(conditioning_tokens[i]) + len(response_tokens[i])) for i in range(bs)] labels = [[-100] * len(conditioning_tokens[i]) + response_tokens[i] for i in range(bs)] input_ids = pad(input_ids, self.tokenizer.pad_token_id) attention_mask = pad(attention_mask, 0) labels = pad(labels, -100) return { "input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "labels": torch.tensor(labels), } def pad(arrays: list, padding: int, max_length: int = None): if max_length is None: max_length = max(list(map(len, arrays))) arrays = [[padding] * (max_length - len(array)) + array for array in arrays] return arrays try: batch_size = len(request.conditioning_text) check_batch_size(batch_size, self.max_batch_size) conditioning_tokens = self.tokenizer(request.conditioning_text)["input_ids"] response_tokens = self.tokenizer(request.response)["input_ids"] max_length_in_batch = max([len(conditioning_tokens) + len(response_tokens)]) check_max_input_length(max_length_in_batch, self.max_input_length) input_tokens = prepare_tensors(conditioning_tokens, response_tokens) for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = 
input_tokens[t].to(self.input_device) loss = self.model(**input_tokens).loss return ForwardResponse(nll=loss.item(), is_encoder_decoder=self.is_encoder_decoder) except Exception as exception: return exception def tokenize(self, request: TokenizeRequest) -> TokenizeResponse: return TokenizeResponse( token_ids=self.tokenizer(request.text).input_ids, is_encoder_decoder=self.is_encoder_decoder, ) def check_max_input_length(input_token_length: int, max_input_length: int) -> None: if max_input_length is None: return if input_token_length > max_input_length: raise Exception(f"max supported input length = {max_input_length} for now") def check_batch_size(batch_size: int, max_batch_size: int) -> None: if max_batch_size is None: return if batch_size > max_batch_size: raise Exception(f"max supported batch size = {max_batch_size} for now") # this is a hack for now def get_hf_model_class(model_class: str) -> Union[AutoModelForCausalLM, AutoModelForSeq2SeqLM]: return getattr(transformers, model_class) def load_tokenizer(model_name: str) -> AutoTokenizer: tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") if tokenizer.pad_token_id is None: tokenizer.add_special_tokens({"pad_token": "[PAD]"}) return tokenizer ================================================ FILE: inference_server/server.py ================================================ import os from functools import partial from flask import Flask, request from flask_api import status from pydantic import BaseModel from .constants import HF_ACCELERATE from .model_handler.deployment import ModelDeployment from .utils import ( ForwardRequest, GenerateRequest, TokenizeRequest, get_exception_response, get_num_tokens_to_generate, get_torch_dtype, parse_bool, run_and_log_time, ) class QueryID(BaseModel): generate_query_id: int = 0 tokenize_query_id: int = 0 forward_query_id: int = 0 # placeholder class for getting args. 
gunicorn does not allow passing args to a
# python script via ArgumentParser
class Args:
    # read all server configuration from environment variables (gunicorn-friendly)
    def __init__(self) -> None:
        self.deployment_framework = os.getenv("DEPLOYMENT_FRAMEWORK", HF_ACCELERATE)
        self.model_name = os.getenv("MODEL_NAME")
        self.model_class = os.getenv("MODEL_CLASS")
        self.dtype = get_torch_dtype(os.getenv("DTYPE"))
        self.allowed_max_new_tokens = int(os.getenv("ALLOWED_MAX_NEW_TOKENS", 100))
        self.max_input_length = int(os.getenv("MAX_INPUT_LENGTH", 512))
        self.max_batch_size = int(os.getenv("MAX_BATCH_SIZE", 4))
        self.debug = parse_bool(os.getenv("DEBUG", "false"))


# ------------------------------------------------------
args = Args()
model = ModelDeployment(args, True)
query_ids = QueryID()
app = Flask(__name__)
# ------------------------------------------------------


@app.route("/query_id/", methods=["GET"])
def query_id():
    """Report the current per-endpoint request counters."""
    return query_ids.dict(), status.HTTP_200_OK


@app.route("/tokenize/", methods=["POST"])
def tokenize():
    """Tokenize the posted texts and return token ids with timing info."""
    try:
        x = request.get_json()
        x = TokenizeRequest(**x)
        response, total_time_taken = run_and_log_time(partial(model.tokenize, request=x))

        response.query_id = query_ids.tokenize_query_id
        # NOTE(review): these counter increments are not atomic — confirm the
        # server runs with a single worker/thread
        query_ids.tokenize_query_id += 1
        response.total_time_taken = "{:.2f} msecs".format(total_time_taken * 1000)

        return response.dict(), status.HTTP_200_OK
    except Exception:
        response = get_exception_response(query_ids.tokenize_query_id, args.debug)
        query_ids.tokenize_query_id += 1
        return response, status.HTTP_500_INTERNAL_SERVER_ERROR


@app.route("/generate/", methods=["POST"])
def generate():
    """Generate continuations for the posted texts."""
    try:
        x = request.get_json()
        x = GenerateRequest(**x)
        # clamp the requested token budget to the server-wide limit
        x.max_new_tokens = get_num_tokens_to_generate(x.max_new_tokens, args.allowed_max_new_tokens)

        response, total_time_taken = run_and_log_time(partial(model.generate, request=x))

        response.query_id = query_ids.generate_query_id
        query_ids.generate_query_id += 1
        response.total_time_taken = "{:.2f} secs".format(total_time_taken)

        return response.dict(), status.HTTP_200_OK
    except Exception:
        response = get_exception_response(query_ids.generate_query_id, args.debug)
        query_ids.generate_query_id += 1
        return response, status.HTTP_500_INTERNAL_SERVER_ERROR


@app.route("/forward/", methods=["POST"])
def forward():
    """Score each response against its conditioning text (negative log-likelihood)."""
    try:
        x = request.get_json()
        x = ForwardRequest(**x)

        if len(x.conditioning_text) != len(x.response):
            raise Exception("unequal number of elements in conditioning_text and response arguments")

        response, total_time_taken = run_and_log_time(partial(model.forward, request=x))

        response.query_id = query_ids.forward_query_id
        query_ids.forward_query_id += 1
        response.total_time_taken = "{:.2f} secs".format(total_time_taken)

        return response.dict(), status.HTTP_200_OK
    except Exception:
        response = get_exception_response(query_ids.forward_query_id, args.debug)
        query_ids.forward_query_id += 1
        return response, status.HTTP_500_INTERNAL_SERVER_ERROR


================================================
FILE: inference_server/utils/__init__.py
================================================
from .requests import (
    ForwardRequest,
    ForwardResponse,
    GenerateRequest,
    GenerateResponse,
    TokenizeRequest,
    TokenizeResponse,
    create_generate_request,
    get_filter_dict,
    parse_bool,
)
from .utils import (
    get_argument_parser,
    get_cuda_visible_devices,
    get_dummy_batch,
    get_exception_response,
    get_num_tokens_to_generate,
    get_str_dtype,
    get_torch_dtype,
    get_world_size,
    pad_ids,
    parse_args,
    print_rank_0,
    run_and_log_time,
    run_rank_n,
)


================================================
FILE: inference_server/utils/requests.py
================================================
from typing import Any, List

from pydantic import BaseModel


class BaseResponse(BaseModel):
    # common response envelope fields, filled in by the server routes
    query_id: int = None
    total_time_taken: str = None


class GenerateRequest(BaseModel):
    # generation request; all sampling knobs default to None (model defaults)
    text: List[str] = None
    min_length: int = None
    do_sample: bool = None
    early_stopping: bool = None
    temperature: float = None
    top_k: int = None
    top_p: float = None
    typical_p: float = None
    repetition_penalty: float = None
    bos_token_id: int = None
    pad_token_id: int =
None eos_token_id: int = None length_penalty: float = None no_repeat_ngram_size: int = None encoder_no_repeat_ngram_size: int = None max_time: float = None max_new_tokens: int = None decoder_start_token_id: int = None diversity_penalty: float = None forced_bos_token_id: int = None forced_eos_token_id: int = None exponential_decay_length_penalty: float = None remove_input_from_output: bool = True def get_generate_kwargs(self) -> dict: x = {} for k, v in self.dict().items(): if k not in ["text", "method"] and v is not None: x[k] = v return x class GenerateResponse(BaseResponse): text: List[str] = None num_generated_tokens: List[int] = None is_encoder_decoder: bool = False class TokenizeRequest(BaseModel): text: List[str] = None class TokenizeResponse(BaseResponse): token_ids: List[List[int]] = None is_encoder_decoder: bool = False class ForwardRequest(BaseModel): conditioning_text: List[str] = None response: List[str] = None class ForwardResponse(BaseResponse): nll: float = None is_encoder_decoder: bool = False def parse_bool(value: str) -> bool: if value.lower() == "true": return True elif value.lower() == "false": return False else: raise ValueError("{} is not a valid boolean value".format(value)) def parse_field(kwargs: dict, field: str, dtype: type, default_value: Any = None) -> Any: if field in kwargs: if type(kwargs[field]) == dtype: return kwargs[field] elif dtype == bool: return parse_bool(kwargs[field]) else: return dtype(kwargs[field]) else: return default_value def create_generate_request(text: List[str], generate_kwargs: dict) -> GenerateRequest: # get user generate_kwargs as json and parse it return GenerateRequest( text=text, min_length=parse_field(generate_kwargs, "min_length", int), do_sample=parse_field(generate_kwargs, "do_sample", bool), early_stopping=parse_field(generate_kwargs, "early_stopping", bool), temperature=parse_field(generate_kwargs, "temperature", float), top_k=parse_field(generate_kwargs, "top_k", int), 
top_p=parse_field(generate_kwargs, "top_p", float), typical_p=parse_field(generate_kwargs, "typical_p", float), repetition_penalty=parse_field(generate_kwargs, "repetition_penalty", float), bos_token_id=parse_field(generate_kwargs, "bos_token_id", int), pad_token_id=parse_field(generate_kwargs, "pad_token_id", int), eos_token_id=parse_field(generate_kwargs, "eos_token_id", int), length_penalty=parse_field(generate_kwargs, "length_penalty", float), no_repeat_ngram_size=parse_field(generate_kwargs, "no_repeat_ngram_size", int), encoder_no_repeat_ngram_size=parse_field(generate_kwargs, "encoder_no_repeat_ngram_size", int), max_time=parse_field(generate_kwargs, "max_time", float), max_new_tokens=parse_field(generate_kwargs, "max_new_tokens", int), decoder_start_token_id=parse_field(generate_kwargs, "decoder_start_token_id", int), diversity_penalty=parse_field(generate_kwargs, "diversity_penalty", float), forced_bos_token_id=parse_field(generate_kwargs, "forced_bos_token_id", int), forced_eos_token_id=parse_field(generate_kwargs, "forced_eos_token_id", int), exponential_decay_length_penalty=parse_field(generate_kwargs, "exponential_decay_length_penalty", float), remove_input_from_output=parse_field(generate_kwargs, "remove_input_from_output", bool, True), ) def get_filter_dict(d: BaseModel) -> dict: d = dict(d) q = {} for i in d: if d[i] != None: q[i] = d[i] del q["text"] return q ================================================ FILE: inference_server/utils/utils.py ================================================ import argparse import copy import json import math import os import sys import time import traceback from functools import partial from typing import Any, Callable, List, Tuple, Union import torch import torch.distributed as dist from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE, HF_CPU # used for benchmarks dummy_input_sentences = [ "DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all", "Everyone is happy and I 
# Sentences used to build dummy batches for benchmarking.
dummy_input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
    "Everyone is happy and I can",
    "The new movie that got Oscar this year",
    "In the far far distance from our galaxy,",
    "Peace is the only way",
]


def get_argument_parser() -> argparse.ArgumentParser:
    """Build the CLI parser shared by the server / benchmark / CLI entry points."""
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="model")
    group.add_argument(
        "--deployment_framework",
        type=str,
        choices=[HF_ACCELERATE, DS_INFERENCE, DS_ZERO, HF_CPU],
        default=HF_ACCELERATE,
    )
    group.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="model name to use",
    )
    group.add_argument(
        "--model_class",
        type=str,
        required=True,
        help="model class to use",
    )
    group.add_argument(
        "--dtype", type=str, required=True, choices=["bf16", "fp16", "int8", "fp32"], help="dtype for model"
    )
    group.add_argument(
        "--generate_kwargs",
        type=str,
        default='{"min_length": 100, "max_new_tokens": 100, "do_sample": false}',
        help="generate parameters. look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters",
    )
    group.add_argument("--max_input_length", type=int, help="max input length")
    group.add_argument("--max_batch_size", type=int, help="max supported batch size")

    return parser


def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
    """Parse CLI args, converting --dtype to a torch.dtype and --generate_kwargs to a dict."""
    args = parser.parse_args()
    args.dtype = get_torch_dtype(args.dtype)
    args.generate_kwargs = json.loads(args.generate_kwargs)
    return args


def run_rank_n(func: Callable, rank: int = 0, barrier: bool = False) -> None:
    """Restrict `func` to run only on distributed rank `rank`.

    Other ranks get a no-op stand-in (that still hits the barrier when asked),
    so all ranks can call the wrapped function symmetrically. When torch
    distributed is not initialized, `func` is returned unchanged.
    """

    # wrapper executed on the chosen rank
    def func_rank_n(*args, **kwargs):
        output = func(*args, **kwargs)
        if barrier:
            dist.barrier()
        return output

    # dummy method for every other rank
    def func_rank_other(*args, **kwargs):
        if barrier:
            dist.barrier()

    if dist.is_initialized():
        if dist.get_rank() == rank:
            return func_rank_n
        return func_rank_other
    else:
        return func


@run_rank_n
def print_rank_0(*args, **kwargs) -> None:
    """print() that only emits on rank 0 in a distributed run."""
    print(*args, **kwargs)


# dtype <-> string tables; kept in sync with the --dtype CLI choices.
_STR_TO_TORCH_DTYPE = {
    "bf16": torch.bfloat16,
    "fp16": torch.float16,
    "int8": torch.int8,
    "fp32": torch.float32,
}
_TORCH_DTYPE_TO_STR = {v: k for k, v in _STR_TO_TORCH_DTYPE.items()}


def get_torch_dtype(dtype_str: str) -> torch.dtype:
    """Map a dtype string ("bf16"/"fp16"/"int8"/"fp32") to the torch dtype.

    Returns None for unknown strings (same fall-through as the old if/elif chain).
    """
    return _STR_TO_TORCH_DTYPE.get(dtype_str)


def get_str_dtype(dtype_str: torch.dtype) -> str:
    """Inverse of get_torch_dtype: torch dtype -> short string, None if unknown."""
    return _TORCH_DTYPE_TO_STR.get(dtype_str)


def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]:
    """Return `batch_size` prompts, cycling the dummy sentences if needed.

    The default list is deep-copied so callers can't mutate the module global.
    """
    if input_sentences is None:
        input_sentences = copy.deepcopy(dummy_input_sentences)

    if batch_size > len(input_sentences):
        input_sentences *= math.ceil(batch_size / len(input_sentences))
    input_sentences = input_sentences[:batch_size]

    return input_sentences


def get_num_tokens_to_generate(max_new_tokens: int, allowed_max_new_tokens: int) -> int:
    """Clamp a requested token budget to the server-side maximum (None -> max)."""
    if max_new_tokens is None:
        return allowed_max_new_tokens
    else:
        return min(max_new_tokens, allowed_max_new_tokens)


def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]:
    """Run a partial (or list of partials) and return (results, elapsed seconds)."""
    start_time = time.time()

    if isinstance(execs, list):
        results = [f() for f in execs]
    else:
        results = execs()

    time_elapsed = time.time() - start_time
    return results, time_elapsed


def pad_ids(arrays, padding, max_length=-1):
    """Left-pad each id list with `padding` up to max_length (or the longest list)."""
    if max_length < 0:
        max_length = max(list(map(len, arrays)))

    arrays = [[padding] * (max_length - len(array)) + array for array in arrays]

    return arrays


def get_exception_response(query_id: int, debug: bool = False):
    """Build a JSON-safe error payload from the exception currently being handled.

    Must be called from inside an `except` block (reads sys.exc_info()).
    With debug=True the formatted stack trace is included.
    """
    e_type, e_message, e_stack_trace = sys.exc_info()
    response = {"error": str(e_type.__name__), "message": str(e_message), "query_id": query_id}

    if debug:
        trace_back = traceback.extract_tb(e_stack_trace)

        # Format stacktrace
        stack_trace = []
        for trace in trace_back:
            stack_trace.append(
                "File : {}, Line : {}, Func.Name : {}, Message : {}".format(trace[0], trace[1], trace[2], trace[3])
            )
        response["stack_trace"] = stack_trace

    return response


def get_world_size() -> int:
    """World size from torch.distributed if initialized, else #CUDA_VISIBLE_DEVICES (0 if unset)."""
    if dist.is_initialized():
        return dist.get_world_size()
    cuda_visible_devices = get_cuda_visible_devices()
    if cuda_visible_devices is None:
        return 0
    return len(cuda_visible_devices)


def get_cuda_visible_devices() -> List[int]:
    """CUDA_VISIBLE_DEVICES parsed as a list of ints, or None if the env var is unset."""
    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if cuda_visible_devices is not None:
        cuda_visible_devices = list(map(int, cuda_visible_devices.split(",")))
    return cuda_visible_devices
import argparse

import requests


def get_args() -> argparse.Namespace:
    """Read the target server's host and port from the command line."""
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="launch config")
    group.add_argument("--host", type=str, required=True, help="host address")
    group.add_argument("--port", type=int, required=True, help="port number")

    return parser.parse_args()


def _post_and_print(endpoint: str, payload: dict) -> None:
    """POST `payload` as JSON to `endpoint` and print the decoded reply."""
    reply = requests.post(url=endpoint, json=payload, verify=False)
    print(reply.json(), "\n")


def generate(url: str) -> None:
    """Exercise the /generate/ endpoint with a small prompt batch."""
    payload = {
        "text": [
            "DeepSpeed",
            "DeepSpeed is a",
            "DeepSpeed is a machine",
            "DeepSpeed is a machine learning framework",
        ],
        "max_new_tokens": 40,
    }
    _post_and_print(url + "/generate/", payload)


def tokenize(url: str) -> None:
    """Exercise the /tokenize/ endpoint."""
    payload = {"text": ["DeepSpeed is a", "DeepSpeed is a machine learning framework"]}
    _post_and_print(url + "/tokenize/", payload)


def forward(url: str) -> None:
    """Exercise the /forward/ endpoint with matching conditioning/response lists."""
    prompts = [
        "DeepSpeed",
        "DeepSpeed is a",
        "DeepSpeed is a machine",
        "DeepSpeed is a machine learning framework",
    ]
    payload = {
        "conditioning_text": prompts,
        "response": prompts,
    }
    _post_and_print(url + "/forward/", payload)


def query_id(url: str) -> None:
    """Fetch the server's current query-id counters."""
    reply = requests.get(url=url + "/query_id/", verify=False)
    print(reply.json(), "\n")


def main():
    """Smoke-test every endpoint of a running inference server."""
    args = get_args()
    url = "http://{}:{}".format(args.host, args.port)

    generate(url)
    tokenize(url)
    forward(url)
    query_id(url)


if __name__ == "__main__":
    main()
const top_kTextBox = document.getElementById('top_k-textbox');
const repetition_penaltySlider = document.getElementById('repetition_penalty-slider');
const repetition_penaltyTextBox = document.getElementById('repetition_penalty-textbox');
const max_new_tokensInput = document.getElementById('max-new-tokens-input');
const textLogOutput = document.getElementById('log-output');

// Slider getters, each paired with a listener that keeps its label in sync.
function get_temperature() {
    return parseFloat(temperatureSlider.value);
}

temperatureSlider.addEventListener('input', async (event) => {
    temperatureTextBox.innerHTML = "temperature = " + get_temperature();
});

function get_top_p() {
    return parseFloat(top_pSlider.value);
}

top_pSlider.addEventListener('input', async (event) => {
    top_pTextBox.innerHTML = "top_p = " + get_top_p();
});

function get_top_k() {
    return parseInt(top_kSlider.value);
}

top_kSlider.addEventListener('input', async (event) => {
    top_kTextBox.innerHTML = "top_k = " + get_top_k();
});

function get_repetition_penalty() {
    return parseFloat(repetition_penaltySlider.value);
}

repetition_penaltySlider.addEventListener('input', async (event) => {
    repetition_penaltyTextBox.innerHTML = "repetition_penalty = " + get_repetition_penalty();
});

function get_max_new_tokens() {
    return parseInt(max_new_tokensInput.value);
}

// Submit: collect the current controls into a payload, POST it to /generate/
// and render the result (or the error) into the log box.
clickButton.addEventListener('click', async (event) => {
    clickButton.textContent = 'Processing';
    clickButton.disabled = true;

    const payload = {
        text: [textGenInput.value],
        temperature: get_temperature(),
        top_k: get_top_k(),
        top_p: get_top_p(),
        max_new_tokens: get_max_new_tokens(),
        repetition_penalty: get_repetition_penalty(),
        do_sample: true,
        remove_input_from_output: true
    };

    // temperature 0 means greedy decoding — sampling must be off
    if (payload.temperature == 0) {
        payload.do_sample = false;
    }
    console.log(payload);

    $.ajax({
        url: '/generate/',
        type: 'POST',
        contentType: "application/json; charset=utf-8",
        data: JSON.stringify(payload),
        headers: { 'Access-Control-Allow-Origin': '*' },
        success: function (response) {
            const input_text = textGenInput.value;
            if ("text" in response) {
                if (response.is_encoder_decoder) {
                    // encoder-decoder output replaces the log, prompt box untouched
                    textLogOutput.value = response.text[0] + '\n\n';
                } else {
                    // decoder-only output is appended to the prompt box
                    textGenInput.value = input_text + response.text[0];
                    textLogOutput.value = '';
                }
                textLogOutput.value += 'total_time_taken = ' + response.total_time_taken + "\n";
                textLogOutput.value += 'num_generated_tokens = ' + response.num_generated_tokens + "\n";
                textLogOutput.style.backgroundColor = "lightblue";
            } else {
                textLogOutput.value = 'total_time_taken = ' + response.total_time_taken + "\n";
                textLogOutput.value += 'error: ' + response.message;
                textLogOutput.style.backgroundColor = "#D65235";
            }
            clickButton.textContent = 'Submit';
            clickButton.disabled = false;
        },
        error: function (error) {
            console.log(JSON.stringify(error, null, 2));
            clickButton.textContent = 'Submit';
            clickButton.disabled = false;
        }
    });
});
temperature = 1
top_k = 50
top_p = 1
max_new_tokens =
repetition_penalty = 1
import argparse

import requests
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.routing import APIRoute, Mount
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from transformers import AutoTokenizer
from uvicorn import run


def get_args() -> argparse.Namespace:
    """CLI options: where to serve the UI and where the generation backend lives."""
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="launch config")
    group.add_argument("--ui_host", type=str, default="127.0.0.1", help="host address for UI")
    group.add_argument("--ui_port", type=int, default=5001, help="port number for UI")
    group.add_argument(
        "--generation_backend_host", type=str, default="127.0.0.1", help="host address for generation server"
    )
    group.add_argument("--generation_backend_port", type=int, default=5000, help="port number for generation server")

    return parser.parse_args()


class Server:
    """Thin FastAPI front-end: serves the playground page and proxies /generate/
    requests to the generation backend."""

    # Upper bound (seconds) on a single proxied generation request; matches the
    # app-level timeout passed to FastAPI below.
    REQUEST_TIMEOUT = 600

    def __init__(self, args: argparse.Namespace):
        self.templates = Jinja2Templates(directory="templates")

        self.ui_host = args.ui_host
        self.ui_port = args.ui_port
        self.generation_backend_host = args.generation_backend_host
        self.generation_backend_port = args.generation_backend_port
        self.workers = 1
        # NOTE(review): this tokenizer is never used in this file — confirm it is
        # needed before removing; loading it downloads the BLOOM tokenizer files.
        self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")

        self.app = FastAPI(
            routes=[
                APIRoute("/", self.homepage, methods=["GET"], response_class=HTMLResponse),
                APIRoute("/generate/", self.generate, methods=["POST"]),
                Mount("/static/", StaticFiles(directory="static"), name="static"),
            ],
            timeout=600,
        )

        self.prefix_checkpoints_list = None

    def homepage(self, request: Request) -> HTMLResponse:
        """Render the playground page."""
        return self.templates.TemplateResponse("index.html", {"request": request})

    def generate(self, request: dict) -> JSONResponse:
        """Forward the raw JSON body to the backend's /generate endpoint.

        Fix: `requests.post` has no default timeout, so a hung backend would
        block this UI worker forever — bound the wait explicitly.
        """
        response = requests.post(
            f"http://{self.generation_backend_host}:{self.generation_backend_port}/generate",
            json=request,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        return JSONResponse(content=response.json())

    def run(self):
        """Attach permissive CORS middleware and start uvicorn."""
        # get around CORS
        self.app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
        )
        run(self.app, host=self.ui_host, port=self.ui_port, workers=self.workers)


def main() -> None:
    """Entry point: build the server from CLI args and serve forever."""
    Server(get_args()).run()


if __name__ == "__main__":
    main()