Repository: togethercomputer/OpenChatKit Branch: main Commit: a7094aa583d4 Files: 83 Total size: 440.4 KB Directory structure: gitextract_ib4k5pq4/ ├── .github/ │ └── ISSUE_TEMPLATE/ │ ├── bug_report.md │ ├── feature_request.md │ └── openchatkit-feedback-report.yaml ├── .gitignore ├── LICENSE ├── README.md ├── data/ │ ├── OIG/ │ │ └── prepare.py │ ├── OIG-chip2/ │ │ └── prepare.sh │ ├── OIG-moderation/ │ │ └── prepare.py │ ├── prepare_data.py │ └── wikipedia-3sentence-level-retrieval-index/ │ └── prepare.py ├── docs/ │ ├── GPT-NeoXT-Chat-Base-20B.md │ └── finetuning-RedPajama-3B.md ├── environment.yml ├── inference/ │ ├── README.md │ ├── bot.py │ └── conversation.py ├── pretrained/ │ ├── GPT-NeoX-20B/ │ │ └── prepare.py │ ├── Llama-2-7B-32K-beta/ │ │ └── prepare.py │ ├── Pythia-6.9B-deduped/ │ │ └── prepare.py │ ├── RedPajama-3B/ │ │ └── prepare.py │ ├── RedPajama-7B/ │ │ └── prepare.py │ └── prepare_pretrained.py ├── retrieval/ │ ├── README.md │ ├── __init__.py │ └── wikipedia.py ├── tools/ │ ├── README.md │ ├── benchmark_input.json │ ├── convert_to_hf_gptneox.py │ ├── convert_to_hf_llama.py │ └── model_load_benchmark.py └── training/ ├── README.md ├── comm/ │ ├── __init__.py │ ├── comm_utils.py │ ├── nccl_backend.py │ └── torch_backend.py ├── data_parallel/ │ ├── __init__.py │ ├── dist_dp_allreduce.py │ ├── dist_dp_central_ps.py │ ├── dist_dp_local.py │ ├── dist_dp_sharded_ps.py │ ├── dist_dp_utils.py │ └── flatten_utils.py ├── dist_clm_train.py ├── dist_prefixlm_train.py ├── finetune_GPT-NeoXT-Chat-Base-20B.sh ├── finetune_Pythia-Chat-Base-7B.sh ├── finetune_RedPajama-INCITE-7B-Chat.sh ├── finetune_RedPajama-INCITE-Chat-3B-v1.sh ├── finetune_llama-2-7b-32k-booksum.sh ├── finetune_llama-2-7b-32k-mqa.sh ├── lora/ │ └── example/ │ ├── redpajama-incite-chat-3b.py │ └── redpajama-incite-chat-3b_inference.py ├── modules/ │ ├── __init__.py │ ├── deberta_modules.py │ ├── dist_deberta_pp_module.py │ ├── dist_gpt_fsdp_module.py │ ├── dist_gpt_pp_module.py │ ├── hf_gpt2_modules.py │ ├── hf_gptj_modules.py │ ├── hf_gptneox_modules.py │ ├── hf_opt_modules.py │ ├── llama_modules.py │ ├── task_modules.py │ ├── tokenizer.py │ └── utils.py ├── optimizer/ │ ├── __init__.py │ ├── grad_scalar.py │ └── optimizer.py ├── pipeline_parallel/ │ ├── __init__.py │ ├── dist_gpipe_pipeline_async.py │ └── dist_pp_utils.py ├── tasks/ │ ├── __init__.py │ └── data_loaders/ │ ├── __init__.py │ ├── data_utils.py │ └── prosocial.py └── utils/ ├── __init__.py ├── dist_args_utils.py ├── dist_checkpoint_utils.py ├── dist_debug_utils.py ├── event_report.py ├── logging_utils.py └── upload_manager.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. **Desktop (please complete the following information):** - OS: [e.g. iOS] - Browser [e.g. chrome, safari] - Version [e.g. 22] **Smartphone (please complete the following information):** - Device: [e.g. iPhone6] - OS: [e.g. iOS8.1] - Browser [e.g. stock browser, safari] - Version [e.g. 22] **Additional context** Add any other context about the problem here. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. ================================================ FILE: .github/ISSUE_TEMPLATE/openchatkit-feedback-report.yaml ================================================ name: OpenChatKit Feedback Report description: Details of feedback from using OpenChatKit test app title: OpenChatKit Feedback Report labels: "feedback report" assignees: [] body: - type: markdown attributes: value: | Thanks for taking the time to fill out this feedback report! - type: textarea id: my-question attributes: label: "My question:" validations: required: true - type: textarea id: bot-response attributes: label: "Bot response:" validations: required: true - type: textarea id: ideal-bot-response attributes: label: "Ideal bot response:" validations: required: true - type: checkboxes id: response-issues attributes: label: "Bot response was:" options: - label: Factually incorrect required: true - label: Not helpful required: true - label: Harmful, inappropriate or unsafe required: true ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # ignore downloaded files /data/OIG-moderation/files/ /data/OIG/files/ /data/wikipedia-3sentence-level-retrieval-index/files/ /pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b/ /pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped/ /pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1 # ignore training output /model_ckpts/ /huggingface_models/ /training/wandb/ # ignore trained low-rank adapters /outputs/ data/OIG-chip2/*.jsonl wandb/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ------------- LICENSE for training code ------------- Copyright (c) 2022 Anonymous Institution Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Footer ================================================ FILE: README.md ================================================ # OpenChatKit OpenChatKit provides a powerful, open-source base to create both specialized and general purpose models for various applications. The kit includes an instruction-tuned language models, a moderation model, and an extensible retrieval system for including up-to-date responses from custom repositories. OpenChatKit models were trained on the OIG-43M training dataset, which was a collaboration between [Together](https://www.together.xyz/), [LAION](https://laion.ai), and [Ontocord.ai](https://ontocord.ai). In this repo, you'll find code for: - Training GPT-NeoXT-Chat-Base-20B, a 20B parameter chat model (see [docs/GPT-NeoXT-Chat-Base-20B.md](docs/GPT-NeoXT-Chat-Base-20B.md)) - Fine-tuning Llama-2-7B-32K-beta, a 7B parameter long context model - Training Pythia-Chat-Base-7B, a 7B parameter chat model - Testing inference using either of the chat models - Augmenting the model with additional context from a retrieval index # Contents - [Getting Started](#getting-started) * [Requirements](#requirements) * [Chatting with Pythia-Chat-Base-7B](#chatting-with-pythia-chat-base-7b) - [Fine-tuning Llama-2-7B-32K-beta](#fine-tuning-llama-2-7b-32k-beta) * [Downloading and converting the base model](#downloading-and-converting-the-base-model) * [Fine-tuning the model](#fine-tuning-the-model) * [Converting trained weights to Hugging Face format](#converting-trained-weights-to-hugging-face-format) - [Reproducing Pythia-Chat-Base-7B](#reproducing-pythia-chat-base-7b) * [Downloading training data and the base model](#downloading-training-data-and-the-base-model) * [(Optional) 8bit Adam](#optional-8bit-adam) * [Training the model](#training-the-model) * [Converting weights to Hugging Face format](#converting-weights-to-hugging-face-format) * [Testing the new model](#testing-the-new-model) - [Monitoring](#monitoring) * [Loguru](#loguru) * [Weights & Biases](#weights--biases) - [Experimental: Retrieval-Augmented Models](#experimental-retrieval-augmented-models) - [See Also](#see-also) - [License](#license) - [Citing OpenChatKit](#citing-openchatkit) - [Acknowledgements](#acknowledgements) # Getting Started In this tutorial, you will download Pythia-Chat-Base-7B, an instruction-tuned language model, and run some some inference requests against it using a command-line tool. Pythia-Chat-Base-7B is a 7B-parameter fine-tuned variant of Pythia-6.9B-deduped from Eleuther AI. Pre-trained weights for this model are available on Hugging Face as [togethercomputer/Pythia-Chat-Base-7B](https://huggingface.co/togethercomputer/Pythia-Chat-Base-7B) under an Apache 2.0 license. More details can be found on the model card for [Pythia-Chat-Base-7B](https://huggingface.co/togethercomputer/Pythia-Chat-Base-7B) on Hugging Face. ## Requirements Before you begin, you need to install PyTorch and other dependencies. 1. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) from their website. 2. Install [Git LFS](https://git-lfs.com/) from their website. 3. Install the `git lfs` hooks. ```shell git lfs install ``` 4. Install mamba in the `base` environment so it's available in all environments. ```shell conda install mamba -n base -c conda-forge ``` 5. Create an environment called OpenChatKit using the `environment.yml` file at the root of this repo. > **Note** > Use `mamba` to create the environment. It's **much** faster than using `conda`. ```shell mamba env create -f environment.yml ``` 6. Activate the new conda environment. ```shell conda activate OpenChatKit ``` ## Chatting with Pythia-Chat-Base-7B To help you try the model, [`inference/bot.py`](inference/bot.py) is a simple command-line test harness that provides a shell inferface enabling you to chat with the model. Simply enter text at the prompt and the model replies. The test harness also maintains conversation history to provide the model with context. Start the bot by calling `bot.py` from the root for the repo. ```shell python inference/bot.py --model togethercomputer/Pythia-Chat-Base-7B ``` Loading the model can take some time, but once it's loaded, you are greeted with a prompt. Say hello. ```shell $ python inference/bot.py Loading /home/csris/src/github.com/togethercomputer/OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:1... Welcome to OpenChatKit shell. Type /help or /? to list commands. >>> Hello. Hello human. >>> ``` Enter additional queries at the prompt, and the model replies. Under the covers, the shell is forming a prompt with all previous queries and passes that to the model to generate more text. The shell also supports additional commands to inspect hyperparamters, the full prompt, and more. Commands are prefixed with a `/`. > **Note** > The `/quit` command exits the shell. Please see [the inference README](inference/README.md) for more details about arguments, running on multiple/specific GPUs, and running on consumer hardware. # Fine-tuning Llama-2-7B-32K-beta Llama-2-7B-32K-beta model can be fine-tuned using various datasets. In this tutorial, we will use the multi-document natural questions dataset and BookSum dataset. ## Downloading and converting the base model To download model Llama-2-7B-32K-beta and prepare it for fine-tuning, run this command from the root of the repository. ```shell python pretrained/Llama-2-7B-32K-beta/prepare.py ``` The weights for this model will be in the `pretrained/Llama-2-7B-32K-beta/togethercomputer_Llama-2-7B-32K-beta` directory. ## Fine-tuning the model The `training/finetune_llama-2-7b-32k-mqa.sh` and `training/finetune_llama-2-7b-32k-booksum.sh` scripts configure and run the training loop. 1. To fine-tune the multi-document natural questions dataset, run: ```shell bash training/finetune_llama-2-7b-32k-mqa.sh ``` 2. To fine-tune the BookSum dataset, run: ```shell bash training/finetune_llama-2-7b-32k-booksum.sh ``` As the training loop runs, checkpoints are saved to the `model_ckpts` directory at the root of the repo. Please see [the training README](training/README.md) for more details about customizing the training run. ## Converting trained weights to Hugging Face format Before you can use this model to perform inference, it must be converted to the Hugging Face format. Run this command from the root of the repo to do so. For example ```shell mkdir huggingface_models \ && python tools/convert_to_hf_llama.py \ --config-name togethercomputer/Llama-2-7B-32K-beta \ --ckpt-path model_ckpts/llama-2-7b-32k-mqa/checkpoint_10 \ --save-path huggingface_models/llama-2-7b-32k-mqa \ --n-stages 4 \ --n-layer-per-stage 8 \ --fp16 ``` where the `--fp16` flag will load and store models in fp16. Make sure to replace model_ckpts/llama-2-7b-32k-mqa/checkpoint_10` with the latest checkpoint in the `model_ckpts/llama-2-7b-32k-mqa` or `model_ckpts/llama-2-7b-32k-booksum` directory. # Reproducing Pythia-Chat-Base-7B This tutorial walks through reproducing the Pythia-Chat-Base-7B model by fine-tuning Eleuther AI's Pythia-6.9B-deduped model using the OIG dataset. ## Downloading training data and the base model The chat model was trained on the [OIG](https://huggingface.co/datasets/laion/OIG) dataset built by [LAION](https://laion.ai/), [Together](https://www.together.xyz/), and [Ontocord.ai](https://www.ontocord.ai/). To download the dataset from Hugging Face run the command below from the root of the repo. ```shell python data/OIG/prepare.py ``` > **Note** > You can help make this chat model better by contributing data! See the [OpenDataHub](https://github.com/togethercomputer/OpenDataHub) repo for more details. Once the command completes, the data will be in the `data/OIG/files` directory. Pythia-Chat-Base-7B is a fine-tuned variant of Pythia-6.9B-deduped from Eleuther AI. To download the model and prepare it for fine tuning, run this command from the root of the repo. ```shell python pretrained/Pythia-6.9B-deduped/prepare.py ``` The weights for this model will be in the `pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped` directory. ## (Optional) 8bit Adam To use 8bit-adam during training, install the `bitsandbytes` package. ```shell pip install bitsandbytes # optional, to use 8bit-adam ``` ## Training the model The `training/finetune_Pythia-Chat-Base-7B.sh` script configures and runs the training loop. After downloading the dataset and the base model, run: ```shell bash training/finetune_Pythia-Chat-Base-7B.sh ``` As the training loop runs, checkpoints are saved to the `model_ckpts` directory at the root of the repo. Please see [the training README](training/README.md) for more details about customizing the training run. ## Converting weights to Hugging Face format Before you can use this model to perform inference, it must be converted to the Hugging Face format. Run this command from the root of the repo to do so. ```shell mkdir huggingface_models \ && python tools/convert_to_hf_gptneox.py \ --config-name EleutherAI/pythia-6.9b-deduped \ --ckpt-path model_ckpts/Pythia-Chat-Base-7B/checkpoint_100 \ --save-path huggingface_models/Pythia-Chat-Base-7B \ --n-stages 4 \ --n-layer-per-stage 8 \ --fp16 ``` where the `--fp16` flag will load and store models in fp16. Make sure to replace `model_ckpts/Pythia-Chat-Base-7B/checkpoint_100` with the latest checkpoint in the `model_ckpts/Pythia-Chat-Base-7B` directory. ## Testing the new model You can use the OpenChatKit Shell test harness to chat with the new model. From the root of the repo, run ```shell python inference/bot.py ``` By default the script will load the model named Pythia-Chat-Base-7B under the `huggingface_models` directory, but you can override that behavior by specifying `--model`. ```shell python inference/bot.py --model ./huggingface_models/GPT-NeoXT-Chat-Base-20B ``` Once the model has loaded, enter text at the prompt and the model will reply. ```shell $ python inference/bot.py Loading /home/csris/src/github.com/togethercomputer/OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:1... Welcome to OpenChatKit shell. Type /help or /? to list commands. >>> Hello. Hello human. >>> ``` The shell also supports additional commands to inspect hyperparamters, the full prompt, and more. Commands are prefixed with a `/`. > **Note** > The `/quit` command exits the shell. Please see [the inference README](inference/README.md) for more details about arguments, running on multiple/specific GPUs, and running on consumer hardware. # Monitoring By default, the training script simply prints the loss as training proceeds, but it can also output metrics to a file using [loguru](https://github.com/Delgan/loguru) or report them to Weights & Biases. ## Loguru Add the flag `--train-log-backend loguru` to your training script to log to `./logs/file_{time}.log` ## Weights & Biases To use Weights & Biases, first login with your Weights & Biases token. ```shell wandb login ``` And set `--train-log-backend wandb` in the training script to enable logging to Weights & Biases. # Experimental: Retrieval-Augmented Models > **Warning** > Retrieval support is experimental. The code in `/retrieval` implements a python package for querying a Faiss index of Wikipedia. The following steps explain how to use this index to augment queries in the test harness with context from the retriever. 1. Download the Wikipedia index. ```shell python data/wikipedia-3sentence-level-retrieval-index/prepare.py ``` 2. Run the bot with the `--retrieval` flag. ```shell python inference/bot.py --retrieval ``` After starting, the bot will load both the chat model and the retrieval index, which takes a long time. Once the model and the index are loaded, all queries will be augmented with extra context. ```shell $ python inference/bot.py --retrieval Loading /OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:0... Loading retrieval index... Welcome to OpenChatKit shell. Type /help or /? to list commands. >>> Where is Zurich? Where is Zurich? Zurich is located in Switzerland. >>> ``` # See Also * [docs/GPT-NeoXT-Chat-Base-20B.md](docs/GPT-NeoXT-Chat-Base-20B.md). OpenChatKit also provides a larger, 20B parameter chat model that was trained on GPT-NeoXT-Chat-Base-20B from Eleuther AI. # License All code in this repository was developed by Together Computer except where otherwise noted. Copyright (c) 2023, Together Computer. All rights reserved. The code is licensed under the Apache 2.0 license. ``` Copyright 2023 Together Computer Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ``` This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate. For full terms, see the LICENSE file. If you have any questions, comments, or concerns about licensing please [contact us](https://www.together.xyz/contact). # Citing OpenChatKit ```bibtex @software{openchatkit, title = {{OpenChatKit: An Open Toolkit and Base Model for Dialogue-style Applications}}, author = {Together Computer}, url = {https://github.com/togethercomputer/OpenChatKit} month = {3}, year = {2023}, version = {0.15}, } ``` # Acknowledgements Our models are fine-tuned versions of large language models trained by [Eleuther AI](https://www.eleuther.ai). We evaluated our model on [HELM](https://crfm.stanford.edu/helm/latest/) provided by the [Center for Research on Foundation Models](https://crfm.stanford.edu). And we collaborated with both [CRFM](https://crfm.stanford.edu) and [HazyResearch](http://hazyresearch.stanford.edu) at Stanford to build this model. We collaborated with [LAION](https://laion.ai/) and [Ontocord.ai](https://www.ontocord.ai/) to build the training data used to fine tune this model. ================================================ FILE: data/OIG/prepare.py ================================================ import sys import os # Import the prepare_data function current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(current_dir, '..')) from prepare_data import prepare_data if __name__ == "__main__": dest_dir = os.path.join(current_dir, "files") prepare_data("https://huggingface.co/datasets/laion/OIG", dest_dir) ================================================ FILE: data/OIG-chip2/prepare.sh ================================================ DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) wget https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl -O ${DIR}/unified_chip2.jsonl ================================================ FILE: data/OIG-moderation/prepare.py ================================================ import sys import os # Import the prepare_data function current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(current_dir, '..')) from prepare_data import prepare_data if __name__ == "__main__": dest_dir = os.path.join(current_dir, "files") prepare_data("https://huggingface.co/datasets/ontocord/OIG-moderation", dest_dir) ================================================ FILE: data/prepare_data.py ================================================ import argparse from shutil import copyfile import boto3 import botocore import glob import gzip import os import re import requests import shutil import subprocess import sys from urllib.parse import urlparse # Check if git-lfs is installed. def is_git_lfs_installed(): try: process = subprocess.run(['git', 'lfs', 'version'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) return process.returncode == 0 except FileNotFoundError: return False # Check if a url is a Hugging Face git URL. def is_huggingface_git_url(url): # Regular expression pattern for Hugging Face git URLs hf_git_pattern = r'^https://huggingface\.co/datasets/[A-Za-z0-9_\.\-/]+$' # Match the pattern against the URL # Return True if a match is found, False otherwise return re.match(hf_git_pattern, url) is not None # Check if the path is a GitHub repository URL. def is_github_repo_url(url): # Regular expression patterns for GitHub repository URLs ssh_pattern = r'^git@github\.com:[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+\.git$' http_pattern = r'^https?://(www\.)?github\.com/[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+\.git$' # Match the patterns against the path # Return True if a match is found in either SSH or HTTP pattern, False otherwise return re.match(ssh_pattern, url) is not None or re.match(http_pattern, url) is not None # Check if the path is an S3 or R2 repository URL. def is_s3_url(url): # Regular expression pattern for S3 URLs s3_pattern = r'^https?://(s3(-[a-z0-9-]+)?\.amazonaws|[a-fA-F0-9]+\.r2\.cloudflarestorage)\.com/[a-z0-9][a-z0-9\.\-]{1,61}[a-z0-9]/[0-9a-zA-Z!\-_\.*\'()/]+$' # Match the pattern against the URL # Return True if a match is found, False otherwise if re.match(s3_pattern, url) is None: return False # Check for a valid bucket name bucket_name = url.split('/')[3] if bucket_name.startswith("xn--"): return False if bucket_name.endswith("-s3alias"): return False if bucket_name.endswith("--ol-s3"): return False if re.match(r'\.\.', bucket_name) is not None: return False if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', bucket_name) is not None: return False if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', bucket_name) is not None: return False return True # Check that the current git repository has git-lfs installed. If the git-lfs # is not installed, then run `git lfs install` if git-lfs is installed. If # git-lfs is not installed, then print an error message and exit. def clone_git_repo(data_source, destination_dir): process = subprocess.run( 'git lfs env | grep -q \'git config filter.lfs.smudge = "git-lfs smudge -- %f"\'', shell=True ) # Check if the git repository has already been cloned if os.path.exists(os.path.join(destination_dir, ".git")): print(f"Git repository already exists at {destination_dir}. Skipping clone.") return # Check if git-lfs is installed if process.returncode != 0 and is_git_lfs_installed(): subprocess.run('git lfs install', shell=True, check=True) process = subprocess.run( 'git lfs install', shell=True ) if process.returncode != 0: print('error: git lfs not installed. please install git-lfs and run `git lfs install`') sys.exit(1) # Clone a GitHub repository. try: subprocess.run(f"git clone {data_source} {destination_dir}", shell=True, check=True) except subprocess.CalledProcessError: print(f"error: failed to clone repository {data_source}") sys.exit(1) # Download all files from an S3 compatible storage service. def download_from_s3(url, destination_dir, access_key_id = None, secret_access_key = None, session_token = None, debug = False): # Get the access key ID and secret access key from the environment variables if access_key_id is None: access_key_id = os.environ.get('AWS_ACCESS_KEY_ID') if secret_access_key is None: secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY') if session_token is None: session_token = os.environ.get('AWS_SESSION_TOKEN') # Create an S3 client parsed_url = url.split('/') endpoint_url = f"{parsed_url[0]}//{parsed_url[2]}" bucket_name = parsed_url[3] key_prefix = "/".join(parsed_url[4:-1]) base_file = parsed_url[-1] if not url.endswith('/') else "" print(f"endpoint_url={endpoint_url} ...") if debug: print(f"access_key_id={access_key_id}") print(f"secret_access_key={secret_access_key}") print(f"bucket_name={bucket_name}") print(f"key_prefix={key_prefix}") print(f"base_file={base_file}") s3 = boto3.resource('s3', endpoint_url = endpoint_url, aws_access_key_id = access_key_id, aws_secret_access_key = secret_access_key, aws_session_token=session_token, region_name = "auto" ) # Create the destination directory if it does not exist os.makedirs(destination_dir, exist_ok=True) try: print(f"Downloading file(s) from S3 {url} to {destination_dir} ...") bucket = s3.Bucket(bucket_name) # Otherwise, download the file at the prefix if url.endswith('/'): # Download the file from the S3 path for obj in bucket.objects.filter(Prefix=key_prefix): if not obj.key.endswith('/'): destination_file = os.path.join(destination_dir, os.path.basename(obj.key)) if not os.path.exists(destination_file): print(f"Downloading {obj.key} ...") bucket.download_file(obj.key, destination_file) else: print(f"File already exists, skipping {obj.key}") else: destination_file = os.path.join(destination_dir, base_file) if not os.path.exists(destination_file): print(f"Downloading {base_file} ...") bucket.download_file(f'/{key_prefix}/{base_file}', destination_file) else: print(f"File already exists, skipping {base_file}") print("Download completed successfully.") return except botocore.exceptions.NoCredentialsError: print("Error: AWS credentials not found.") except botocore.exceptions.EndpointConnectionError: print("Error: Unable to connect to the S3 endpoint.") except botocore.exceptions.ParamValidationError as e: print(f"Error: Invalid S3 URL: {e}") except botocore.exceptions.ClientError as e: print(f"Error: {e.response['Error']['Message']}") # Something went wrong, exit with error. sys.exit(1) def download_from_url(url, destination_dir): print(f"Downloading file from {url} to {destination_dir} ...") try: # Parse the URL to extract the filename parsed_url = urlparse(url) filename = os.path.basename(parsed_url.path) # Construct the destination file path destination_file = os.path.join(destination_dir, filename) # Download the file response = requests.get(url, stream=True) response.raise_for_status() with open(destination_file, 'wb') as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) print("Download completed successfully.") return except requests.exceptions.HTTPError as e: print(f"Error: {e}") except requests.exceptions.ConnectionError: print("Error: Unable to connect to the URL.") except requests.exceptions.Timeout: print("Error: Connection timed out.") except requests.exceptions.RequestException as e: print(f"Error: {e}") # Something went wrong, exit with error. sys.exit(1) # Perepare data will clone the git repository given by data_source into the # destination_dir. def prepare_data(data_source, destination_dir, access_key_id=None, secret_access_key=None, debug=False): # Check that destination_dir is a directory. If it does not exist, then # create it. if not os.path.exists(destination_dir): os.makedirs(destination_dir) elif not os.path.isdir(destination_dir): print(f"Error: {destination_dir} is not a directory.") sys.exit(1) if os.path.isfile(data_source): # Handle the case where the data source is a local file print(f"Copying file {data_source} to {destination_dir} ...") copyfile(data_source, destination_dir) elif is_github_repo_url(data_source) or is_huggingface_git_url(data_source): # Handle the case where the data source is a GitHub or Hugging Face repository clone_git_repo(data_source, destination_dir) elif is_s3_url(data_source): # Handle the case where the data source is an S3 URL download_from_s3(url=data_source, destination_dir=destination_dir, access_key_id=access_key_id, secret_access_key=secret_access_key, debug=debug) elif data_source.startswith('http://') or data_source.startswith('https://'): # Handle the case where the data source is a URL download_from_url(data_source, destination_dir) else: print(f"Error: Invalid data source: {data_source}") sys.exit(1) # Extract gzipped files, if present for file_path in glob.glob(f"{destination_dir}/*.gz"): out_path, _ = os.path.splitext(file_path) with gzip.open(file_path, 'rb') as infile, open(out_path, 'wb') as outfile: shutil.copyfileobj(infile, outfile) os.remove(file_path) def main(): parser = argparse.ArgumentParser(description="Script for cloning a git repository and extracting files.") parser.add_argument("-s", "--data-source", required=True, help="URL of the data source (git repository)") parser.add_argument("-d", "--dest", required=True, help="Destination directory to clone the repository and extract files") parser.add_argument("-a", "--access-key-id", required=False, help="AWS access key ID") parser.add_argument("-k", "--secret-access-key", required=False, help="AWS secret access key") parser.add_argument("--debug", action='store_true', help="Enable debug mode") args = parser.parse_args() prepare_data(args.data_source, args.dest, args.access_key_id, args.secret_access_key, args.debug) if __name__ == "__main__": main() ================================================ FILE: data/wikipedia-3sentence-level-retrieval-index/prepare.py ================================================ import sys import os # Import the prepare_data function current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(current_dir, '..')) from prepare_data import prepare_data if __name__ == "__main__": dest_dir = os.path.join(current_dir, "files") prepare_data("https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index", dest_dir) ================================================ FILE: docs/GPT-NeoXT-Chat-Base-20B.md ================================================ # GPT-NeoXT-Chat-Base-20B OpenChatKit includes an instruction-tuned 20 billion parameter language model called GPT-NeoXT-Chat-Base-20B, a 6 billion parameter moderation model, and an extensible retrieval system for including up-to-date responses from custom repositories. It was trained on the OIG-43M training dataset, which was a collaboration between [Together](https://www.together.xyz/), [LAION](https://laion.ai), and [Ontocord.ai](https://ontocord.ai). Much more than a model release, this is the beginning of an open source project. We are releasing a set of tools and processes for ongoing improvement with community contributions. In this doc, you'll find steps for: - Training an OpenChatKit model - Testing inference using the model - Augmenting the model with additional context from a retrieval index # Contents - [Requirements](#requirements) - [Pre-trained Weights](#pre-trained-weights) - [Datasets](#datasets) * [Data Contributions](#data-contributions) - [Pretrained Base Model](#pretrained-base-model) - [Training and Finetuning](#training-and-finetuning) * [(Optional) 8bit Adam](#optional-8bit-adam) * [Train GPT-NeoX-Chat-Base-20B](#train-gpt-neox-chat-base-20b) - [Converting Weights to Huggingface Format](#converting-weights-to-huggingface-format) - [Inference](#inference) - [Monitoring](#monitoring) * [Loguru](#loguru) * [Weights & Biases](#weights--biases) - [Experimental: Retrieval-Augmented Models](#experimental-retrieval-augmented-models) - [Acknowledgements](#acknowledgements) # Requirements Before you begin, you need to install PyTorch and other dependencies. 1. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) from their website. 2. Install [Git LFS](https://git-lfs.com/) from their website. 3. Install the `git lfs` hooks. ```shell git lfs install ``` 4. Install mamba in the `base` environment so it's available in all environments. ```shell conda install mamba -n base -c conda-forge ``` 5. Create an environment called OpenChatKit using the `environment.yml` file at the root of this repo. ```shell mamba env create -f environment.yml ``` 6. Activate the new conda environment. ```shell conda activate OpenChatKit ``` # Pre-trained Weights GPT-NeoXT-Chat-Base-20B is a 20B-parameter variant of GPT-NeoX, fine-tuned on conversational datasets. We are releasing pre-trained weights for this model as [togethercomputer/GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) on Huggingface. More details can be found on the model card for [GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) on Huggingface. # Datasets The chat model was trained on the [OIG](https://huggingface.co/datasets/laion/OIG) dataset built by [LAION](https://laion.ai/), [Together](https://www.together.xyz/), and [Ontocord.ai](https://www.ontocord.ai/). To download the dataset from Huggingface run the command below from the root of the repo. ```shell python data/OIG/prepare.py ``` Once the command completes, the data will be in the `data/OIG/files` directory. ## Data Contributions You can help make this chat model better by contributing data! See the [OpenDataHub](https://github.com/togethercomputer/OpenDataHub) repo for more details. # Pretrained Base Model As mentioned above, the chat model is a fine-tuned variant of GPT-NeoX-20B from Eleuther AI. To download GPT-NeoX-20B and prepare it for fine tuning, run this command from the root of the repo. ```shell python pretrained/GPT-NeoX-20B/prepare.py ``` The weights for this model will be in the `pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b`. In case you want to fine-tune other gpt-neox models, e.g. [the Pythia model suite](https://huggingface.co/models?sort=downloads&search=pythia), you can specify the HF model name, for example: ```shell python pretrained/GPT-NeoX-20B/prepare.py --model-name EleutherAI/pythia-6.9b-deduped ``` And the weights for this model will be in the `pretrained/GPT-NeoX-20B/EleutherAI_pythia-6.9b-deduped`. # Training and Finetuning ## (Optional) 8bit Adam To use 8bit-adam during training, install the `bitsandbytes` package. ```shell pip install bitsandbytes # optional, to use 8bit-adam ``` ## Train GPT-NeoX-Chat-Base-20B The `training/finetune_GPT-NeoXT-Chat-Base-20B.sh` script configures and runs the training loop. After downloading the dataset and the base model, run: ```shell bash training/finetune_GPT-NeoXT-Chat-Base-20B.sh ``` The script launches 8 processes with a pipeline-parallel degree of 8 and a data-parallel degree of 1. As the training loop runs, checkpoints are saved to the `model_ckpts` directory at the root of the repo. Please see [the training README](training/README.md) for more details about customizing the training run. The `training/finetune_Pythia-Chat-Base-7B.sh` script is another example to fine-tune a 7B pythia (gpt-neox) model. The script launches 8 processes with a pipeline-parallel degree of 4 and a data-parallel degree of 2. # Converting Weights to Huggingface Format Before you can use this model to perform inference, it must be converted to the Huggingface format. Run this command from the root of the repo to do so. ```shell mkdir huggingface_models \ && python tools/convert_to_hf_gptneox.py \ --ckpt-path model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100 \ --save-path huggingface_models/GPT-NeoXT-Chat-Base-20B \ --n-stages 8 \ --n-layer-per-stage 6 \ --fp16 ``` where the `--fp16` flag will load and store models in fp16. Make sure to replace `model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100` with the latest checkpoint in the `model_ckpts/GPT-Neo-XT-Chat-Base-20B` directory. If you need to convert ckpts of other gpt-neox variants, make sure to specify the correct config name for your variant. For example, if you want to convert a checkpoint fine-tuned from `EleutherAI/pythia-6.9b-deduped`, you should indicate this as a config name: ```shell python tools/convert_to_hf_gptneox.py \ --config-name EleutherAI/pythia-6.9b-deduped \ --ckpt-path model_ckpts/Pythia-Chat-Base-7B/checkpoint_100 \ --save-path huggingface_models/Pythia-Chat-Base-7B \ --n-stages 4 \ --n-layer-per-stage 8 \ --fp16 ``` # Inference To help you test the model, we provide a simple test command line test harness to interact with the bot. ```shell python inference/bot.py ``` By default the script will load the model named GPT-NeoXT-Chat-Base-20B model under the `huggingface_models` directory, but you can override that behavior by specifying `--model`. For example, if you want to load the base model from our Huggingface, repo, you can run the following command which downloads the weights from HuggingFace. ```shell python inference/bot.py --model togethercomputer/GPT-NeoXT-Chat-Base-20B ``` Once the model has loaded, enter text at the prompt and the model will reply. ```shell $ python inference/bot.py Loading /home/csris/src/github.com/togethercomputer/OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:1... Welcome to OpenChatKit shell. Type /help or /? to list commands. >>> Hello. Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. Hello human. >>> ``` Commands are prefixed with a `/`, and the `/quit` command exits. Please see [the inference README](inference/README.md) for more details about arguments, running on multiple/specific GPUs, and running on consumer hardware. # Monitoring By default, the training script simply prints the loss as training proceeds, but it can also output metrics to a file using [loguru](https://github.com/Delgan/loguru) or report them to Weights & Biases. ## Loguru Add the flag `--train-log-backend loguru` to your training script to log to `./logs/file_{time}.log` ## Weights & Biases To use Weights & Biases, first login with your Weights & Biases token. ```shell wandb login ``` And set `--train-log-backend wandb` in the training script to enable logging to Weights & Biases. # Experimental: Retrieval-Augmented Models *Note: Retrieval is still experimental.* The code in `/retrieval` implements a python package for querying a Faiss index of Wikipedia. The following steps explain how to use this index to augment queries in the test harness with context from the retriever. 1. Download the Wikipedia index. ```shell python data/wikipedia-3sentence-level-retrieval-index/prepare.py ``` 2. Run the bot with the `--retrieval` flag. ```shell python inference/bot.py --retrieval ``` After starting, the bot will load both the chat model and the retrieval index, which takes a long time. Once the model and the index are loaded, all queries will be augmented with extra context. ```shell $ python inference/bot.py --retrieval Loading /OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:0... Loading retrieval index... Welcome to OpenChatKit shell. Type /help or /? to list commands. >>> Where is Zurich? Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. Where is Zurich? Zurich is located in Switzerland. >>> ``` # Acknowledgements Our model is a fine-tuned version of [gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b), a large language model trained by [Eleuther AI](https://www.eleuther.ai). We evaluated our model on [HELM](https://crfm.stanford.edu/helm/latest/) provided by the [Center for Research on Foundation Models](https://crfm.stanford.edu). And we collaborated with both [CRFM](https://crfm.stanford.edu) and [HazyResearch](http://hazyresearch.stanford.edu) at Stanford to build this model. We collaborated with [LAION](https://laion.ai/) and [Ontocord.ai](https://www.ontocord.ai/) to build the training data used to fine tune this model. ================================================ FILE: docs/finetuning-RedPajama-3B.md ================================================ # RedPajama-3B In this tutorial, you will learn how to fine-tune a base LLM on a sample of data. By the end of the tutorial, you will have fine-tuned the RedPajama-INCITE-Chat-3B model using a sample of chat data from the OIG dataset. You can adapt this tutorial to fine-tune with your own data. In order to fine-tune the RedPajama 3B models, please follow these steps: First clone the OpenChatKit repo: ```shell git clone git@github.com:togethercomputer/OpenChatKit.git ``` Next install dependencies as instructed by the OpenChatKit repo. # Prepare Weights ```shell python pretrained/RedPajama-3B/prepare.py ``` This script will download the weight from HuggingFace and prepare it for finetuning. The prepared weights will be saved at ``` pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1 ``` # Prepare Fine Tuning Data We now need to preapre the training data. We provide an example script that downloads a small slice of data from OIG. To download this sample dataset, please run: ``` bash data/OIG-chip2/prepare.sh ```` The sample dataset will be saved at ``` data/OIG-chip2/unified_chip2.jsonl. ``` # Run Fine Tuning Script We provide an example training script. Please configure the parameters (e.g., learning_rate, batch_size, dataset_path) according to your hardware configuration. Then to start training, simply run ``` bash training/finetune_RedPajama-INCITE-Chat-3B-v1.sh ``` # Convert to Huggingface Format The fine-tuned model will be saved to ``` model_ckpts/rp-incite-chat-3b-finetuned/checkpoint_{steps} ``` In order to use it for inference, you will need to convert it to the HuggingFace format. To do so, run the following script (as an example, please change the checkpoint path, n-stages and n-layer-per-stage according to the training script): The default for n-stages used in the training script is 10 and the n-layer-per-stage is 8. ``` python tools/convert_to_hf_gptneox.py --config-name togethercomputer/RedPajama-INCITE-Chat-3B-v1 --ckpt-path model_ckpts/redpajama-incite-chat-3b-sample/checkpoint_10/ --save-path model_ckpts/hf --n-stages 4 --n-layer-per-stage 8 ``` Then you are ready to go! You can load the model with HuggingFace and use it for inference, for example: ```python import torch import transformers from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Chat-3B-v1") model = AutoModelForCausalLM.from_pretrained("./model_ckpts/hf", torch_dtype=torch.float16) model = model.to('cuda:0') prompt = ": Who is Alan Turing?\n:" inputs = tokenizer(prompt, return_tensors='pt').to(model.device) input_length = inputs.input_ids.shape[1] outputs = model.generate( **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True ) token = outputs.sequences[0, input_length:] output_str = tokenizer.decode(token) print(output_str) ``` Please note the above finetuning takes around 60GB VRAM to fit everything in to GPU, and may take even more to fit training data. If you do not have such GPUs, we also provide the low-rank finetuning scripts that works with 14GB VRAM. Here’re the steps to get started. * Clone the OpenChatKit repo, install dependencies and prepare the dataset. These steps are the same as full fine-tuning. * The sample low-rank finetuning script is at /training/lora/redpajama-incite-chat-3b.py, please modify this script to accommodate your own training data and preferred configuration. * Then you can start low-rank finetuning by running this script. Once the finetuning is finished, the resulting low-rank adapter will be saved to /outputs, and you can do inference with the following script. ``` python training/lora/redpajama-incite-chat-3b_inference.py ``` ================================================ FILE: environment.yml ================================================ name: OpenChatKit channels: - pytorch - nvidia - conda-forge - defaults dependencies: - cudatoolkit=11.8.0 - cupy=12.1.0 - faiss-gpu=1.7.2 - fastparquet=0.5.0 - nccl=2.18.3.1 - pip=23.2 - pyarrow=12.0.1 - python=3.10.9 - python-snappy=0.6.1 - pytorch=2.0.1 - pytorch-cuda=11.8 - snappy=1.1.9 - torchaudio=2.0.2 - torchvision=0.15.2 - pip: - accelerate==0.21.0 - boto3 - datasets==2.13.1 - loguru==0.6.0 - netifaces==0.11.0 - pandas==2.0.3 - transformers==4.31.0 - wandb==0.15.5 - zstandard==0.21.0 - sentencepiece ================================================ FILE: inference/README.md ================================================ # OpenChatKit Inference This directory contains code for OpenChatKit's inference. ## Arguments - `--gpu-id`: Primary GPU device to load inputs onto for inference. Default: `0` - `--model`: name/path of the model. Default = `../huggingface_models/GPT-NeoXT-Chat-Base-20B` - `--max-tokens`: the maximum number of tokens to generate. Default: `128` - `--sample`: indicates whether to sample. Default: `True` - `--temperature`: temperature for the LM. Default: `0.6` - `--top-k`: top-k for the LM. Default: `40` - `--retrieval`: augment queries with context from the retrieval index. Default `False` - `-g` `--gpu-vram`: GPU ID and VRAM to allocate to loading the model, separated by a `:` in the format `ID:RAM` where ID is the CUDA ID and RAM is in GiB. `gpu-id` must be present in this list to avoid errors. Accepts multiple values, for example, `-g ID_0:RAM_0 ID_1:RAM_1 ID_N:RAM_N` - `-r` `--cpu-ram`: CPU RAM overflow allocation for loading the model. Optional, and only used if the model does not fit onto the GPUs given. ## Hardware requirements for inference The GPT-NeoXT-Chat-Base-20B model requires at least 41GB of free VRAM. Used VRAM also goes up by ~100-200 MB per prompt. - A **minimum of 80 GB is recommended** - A **minimum of 48 GB in VRAM is recommended** for fast responses. If you'd like to run inference on a GPU with <48 GB VRAM, refer to this section on [running on consumer hardware](#running-on-consumer-hardware). By default, inference uses only CUDA Device 0. **NOTE: Inference currently requires at least 1x GPU.** ## Running on multiple GPUs Add the argument ```-g ID0:MAX_VRAM ID1:MAX_VRAM ID2:MAX_VRAM ...``` where IDx is the CUDA ID of the device and MAX_VRAM is the amount of VRAM you'd like to allocate to the device. For example, if you are running this on 4x 48 GB GPUs and want to distribute the model across all devices, add ```-g 0:10 1:12 2:12 3:12 4:12```. In this example, the first device gets loaded to a max of 10 GiB while the others are loaded with a max of 12 GiB. How it works: The model fills up the max available VRAM on the first device passed and then overflows into the next until the whole model is loaded. **IMPORTANT: This MAX_VRAM is only for loading the model. It does not account for the additional inputs that are added to the device. It is recommended to set the MAX_VRAM to be at least 1 or 2 GiB less than the max available VRAM on each device, and at least 3GiB less than the max available VRAM on the primary device (set by `gpu-id` default=0).** **Decrease MAX_VRAM if you run into CUDA OOM. This happens because each input takes up additional space on the device.** **NOTE: Total MAX_VRAM across all devices must be > size of the model in GB. If not, `bot.py` automatically offloads the rest of the model to RAM and disk. It will use up all available RAM. To allocate a specified amount of RAM: [refer to this section on running on consumer hardware](#running-on-consumer-hardware).** ## Running on specific GPUs If you have multiple GPUs but would only like to use a specific device(s), [use the same steps as in this section on running on multiple devices](#running-on-multiple-gpus) and only specify the devices you'd like to use. Also, if needed, add the argument `--gpu-id ID` where ID is the CUDA ID of the device you'd like to make the primary device. NOTE: The device specified in `--gpu-id` must be present as one of the ID in the argument `-g` to avoid errors. - **Example #1**: to run inference on devices 2 and 5 with a max of 25 GiB on each, and make device 5 the primary device, add: `--gpu-id 5 -g 2:25 5:25`. In this example, not adding `--gpu-id 5` will give you an error. - **Example #2**: to run inference on devices 0 and 3 with a max of 10GiB on 0 and 40GiB on 3, with device 0 as the primary device, add: `-g 0:10 3:40`. In this example, `--gpu-id` is not required because device 0 is specified in `-g`. - **Example #3**: to run inference only on device 1 with a max of 75 GiB, add: `--gpu-id 1 -g 1:75` ## Running on consumer hardware If you have multiple GPUs, each <48 GB VRAM, [the steps mentioned in this section on running on multiple GPUs](#running-on-multiple-gpus) still apply, unless, any of these apply: - Running on just 1x GPU with <48 GB VRAM, - <48 GB VRAM combined across multiple GPUs - Running into Out-Of-Memory (OOM) issues In which case, add the flag `-r CPU_RAM` where CPU_RAM is the maximum amount of RAM you'd like to allocate to loading model. Note: This significantly reduces inference speeds. The model will load without specifying `-r`, however, it is not recommended because it will allocate all available RAM to the model. To limit how much RAM the model can use, add `-r`. If the total VRAM + CPU_RAM < the size of the model in GiB, the rest of the model will be offloaded to a folder "offload" at the root of the directory. Note: This significantly reduces inference speeds. - Example: `-g 0:12 -r 20` will first load up to 12 GiB of the model into the CUDA device 0, then load up to 20 GiB into RAM, and load the rest into the "offload" directory. How it works: - https://github.com/huggingface/blog/blob/main/accelerate-large-models.md - https://www.youtube.com/embed/MWCSGj9jEAo ================================================ FILE: inference/bot.py ================================================ import os import sys INFERENCE_DIR = os.path.dirname(os.path.abspath(__file__)) # TODO: PYTHONPATH hacks are never a good idea. clean this up later sys.path.append(os.path.join(INFERENCE_DIR, '..')) import cmd import torch import argparse import conversation as convo import retrieval.wikipedia as wp from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, StoppingCriteria, StoppingCriteriaList from accelerate import infer_auto_device_map, init_empty_weights class StopWordsCriteria(StoppingCriteria): def __init__(self, tokenizer, stop_words, stream_callback): self._tokenizer = tokenizer self._stop_words = stop_words self._partial_result = '' self._stream_buffer = '' self._stream_callback = stream_callback def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: first = not self._partial_result text = self._tokenizer.decode(input_ids[0, -1]) self._partial_result += text for stop_word in self._stop_words: if stop_word in self._partial_result: return True if self._stream_callback: if first: text = text.lstrip() # buffer tokens if the partial result ends with a prefix of a stop word, e.g. " 40 GB VRAM # load model onto one device if max_memory is None: self._model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16, device_map="auto") self._model.to(device) # load the model with the given max_memory config (for devices with insufficient VRAM or multi-gpu) else: config = AutoConfig.from_pretrained(model_name) # load empty weights with init_empty_weights(): model_from_conf = AutoModelForCausalLM.from_config(config) model_from_conf.tie_weights() # create a device_map from max_memory device_map = infer_auto_device_map( model_from_conf, max_memory=max_memory, no_split_module_classes=["GPTNeoXLayer"], dtype="float16" ) # load the model with the above device_map self._model = AutoModelForCausalLM.from_pretrained( model_name, device_map=device_map, offload_folder="offload", # optional offload-to-disk overflow directory (auto-created) offload_state_dict=True, torch_dtype=torch.float16 ) self._tokenizer = AutoTokenizer.from_pretrained(model_name) def do_inference(self, prompt, max_new_tokens, do_sample, temperature, top_k, stream_callback=None): stop_criteria = StopWordsCriteria(self._tokenizer, [self.human_id], stream_callback) inputs = ( self._tokenizer(prompt, return_tensors='pt') .to(self._model.device) ) outputs = self._model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=do_sample, temperature=temperature, top_k=top_k, pad_token_id=self._tokenizer.eos_token_id, stopping_criteria=StoppingCriteriaList([stop_criteria]), ) output = self._tokenizer.batch_decode(outputs)[0] # remove the context from the output output = output[len(prompt):] return output class OpenChatKitShell(cmd.Cmd): intro = "Welcome to OpenChatKit shell. Type /help or /? to list commands.\n" prompt = ">>> " def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, temperature, top_k, retrieval, max_memory, do_stream): super().__init__() self._gpu_id = gpu_id self._model_name_or_path = model_name_or_path self._max_tokens = max_tokens self._sample = sample self._temperature = temperature self._top_k = top_k self._retrieval = retrieval self._max_memory = max_memory self._do_stream = do_stream def preloop(self): print(f"Loading {self._model_name_or_path} to cuda:{self._gpu_id}...") self._model = ChatModel(self._model_name_or_path, self._gpu_id, self._max_memory) if self._retrieval: print(f"Loading retrieval index...") self._index = wp.WikipediaIndex() self._convo = convo.Conversation( self._model.human_id, self._model.bot_id) def precmd(self, line): if line.startswith('/'): return line[1:] else: return 'say ' + line def do_say(self, arg): if self._retrieval: results = self._index.search(arg) if len(results) > 0: self._convo.push_context_turn(results[0]) self._convo.push_human_turn(arg) output = self._model.do_inference( self._convo.get_raw_prompt(), self._max_tokens, self._sample, self._temperature, self._top_k, lambda x : print(x, end='', flush=True) if self._do_stream else None, ) self._convo.push_model_response(output) print("" if self._do_stream else self._convo.get_last_turn()) def do_raw_say(self, arg): output = self._model.do_inference( arg, self._max_tokens, self._sample, self._temperature, self._top_k ) print(output) def do_raw_prompt(self, arg): print(self._convo.get_raw_prompt()) def do_reset(self, arg): self._convo = convo.Conversation( self._model.human_id, self._model.bot_id) def do_hyperparameters(self, arg): print( f"Hyperparameters:\n" f" max_tokens: {self._max_tokens}\n" f" sample: {self._sample}\n" f" temperature: {self._temperature}\n" f" top_k: {self._top_k}" ) def do_quit(self, arg): return True def main(): parser = argparse.ArgumentParser( description='test harness for OpenChatKit') parser.add_argument( '--gpu-id', default=0, type=int, help='the ID of the GPU to run on' ) parser.add_argument( '--model', default=f"{INFERENCE_DIR}/../huggingface_models/Pythia-Chat-Base-7B", help='name/path of the model' ) parser.add_argument( '--max-tokens', default=128, type=int, help='the maximum number of tokens to generate' ) parser.add_argument( '--sample', default=True, action='store_true', help='indicates whether to sample' ) parser.add_argument( '--no-stream', action='store_true', help='indicates whether to stream tokens' ) parser.add_argument( '--temperature', default=0.6, type=float, help='temperature for the LM' ) parser.add_argument( '--top-k', default=40, type=int, help='top-k for the LM' ) parser.add_argument( '--retrieval', default=False, action='store_true', help='augment queries with context from the retrieval index' ) parser.add_argument( '-g', '--gpu-vram', action='store', help='max VRAM to allocate per GPU', nargs='+', required=False, ) parser.add_argument( '-r', '--cpu-ram', default=None, type=int, help='max CPU RAM to allocate', required=False ) args = parser.parse_args() # set max_memory dictionary if given if args.gpu_vram is None: max_memory = None else: max_memory = {} for i in range(len(args.gpu_vram)): # assign CUDA ID as label and XGiB as value max_memory[int(args.gpu_vram[i].split(':')[0])] = f"{args.gpu_vram[i].split(':')[1]}GiB" if args.cpu_ram is not None: # add cpu to max-memory if given max_memory['cpu'] = f"{int(args.cpu_ram)}GiB" OpenChatKitShell( args.gpu_id, args.model, args.max_tokens, args.sample, args.temperature, args.top_k, args.retrieval, max_memory, not args.no_stream, ).cmdloop() if __name__ == '__main__': main() ================================================ FILE: inference/conversation.py ================================================ import re import time MEANINGLESS_WORDS = ['', '', '<|endoftext|>'] PRE_PROMPT = """\ Current Date: {} Current Time: {} """ def clean_response(response): for word in MEANINGLESS_WORDS: response = response.replace(word, "") response = response.strip("\n") return response class Conversation: def __init__(self, human_id, bot_id): cur_date = time.strftime('%Y-%m-%d') cur_time = time.strftime('%H:%M:%S %p %Z') self._human_id = human_id self._bot_id = bot_id self._prompt = PRE_PROMPT.format(cur_date, cur_time) def push_context_turn(self, context): # for now, context is represented as a human turn self._prompt += f"{self._human_id}: {context}\n" def push_human_turn(self, query): self._prompt += f"{self._human_id}: {query}\n" self._prompt += f"{self._bot_id}:" def push_model_response(self, response): has_finished = self._human_id in response bot_turn = response.split(f"{self._human_id}:")[0] bot_turn = clean_response(bot_turn) # if it is truncated, then append "..." to the end of the response if not has_finished: bot_turn += "..." self._prompt += f"{bot_turn}\n" def get_last_turn(self): human_tag = f"{self._human_id}:" bot_tag = f"{self._bot_id}:" turns = re.split(f"({human_tag}|{bot_tag})\W?", self._prompt) return turns[-1] def get_raw_prompt(self): return self._prompt @classmethod def from_raw_prompt(cls, value): self._prompt = value ================================================ FILE: pretrained/GPT-NeoX-20B/prepare.py ================================================ import sys import os # Import the prepare_data function current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(current_dir, '..')) from prepare_pretrained import prepare_pretrained if __name__ == "__main__": model_name = "EleutherAI/gpt-neox-20b" save_path = os.path.join(current_dir, model_name.replace('/', '_')) prepare_pretrained(save_path, model_name) ================================================ FILE: pretrained/Llama-2-7B-32K-beta/prepare.py ================================================ import os import argparse import torch from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig DIR = os.path.dirname(os.path.abspath(__file__)) USE_AUTH_TOKEN = False if __name__ == '__main__': parser = argparse.ArgumentParser(description='Convert HF checkpoints') parser.add_argument('--model-name', type=str, default='togethercomputer/Llama-2-7B-32K-beta', help='model-name') parser.add_argument('--save-dir', type=str, default=DIR, help='model-name') parser.add_argument('--offload-dir', type=str, default=None, help='directory to offload from memory') args = parser.parse_args() if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_')) if not os.path.exists(save_path): os.mkdir(save_path) print('loading model from HF...') config = AutoConfig.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) config.save_pretrained(save_path) tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) tokenizer.save_pretrained(save_path) # offload model from memory to disk if offload-dir is specified if args.offload_dir is not None: if not os.path.exists(args.offload_dir): os.mkdir(args.offload_dir) model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=args.offload_dir, use_auth_token=USE_AUTH_TOKEN) else: model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, use_auth_token=USE_AUTH_TOKEN) print('loaded model from HF...') print('converting the embedding layer...') item = {} item['embed_tokens.weight'] = model.model.embed_tokens.weight torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) print('converted the embedding layer.') for i in range(len(model.model.layers)): print(f'converting the {i}-th transformer layer...') torch.save(model.model.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) print(f'converted the {i}-th transformer layer.') print('converting the lm_head layer...') item = {} item['lm_head.weight'] = model.lm_head.weight item['norm.weight'] = model.model.norm.weight torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) print('converted the lm_head layer.') ================================================ FILE: pretrained/Pythia-6.9B-deduped/prepare.py ================================================ import sys import os # Import the prepare_data function current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(current_dir, '..')) from prepare_pretrained import prepare_pretrained if __name__ == "__main__": model_name = "EleutherAI/pythia-6.9b-deduped" save_path = os.path.join(current_dir, model_name.replace('/', '_')) prepare_pretrained(save_path, model_name) ================================================ FILE: pretrained/RedPajama-3B/prepare.py ================================================ import os import sys # Import the prepare_data function current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(current_dir, '..')) from prepare_pretrained import prepare_pretrained if __name__ == "__main__": model_name = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" save_path = os.path.join(current_dir, model_name.replace('/', '_')) prepare_pretrained(save_path, model_name) ================================================ FILE: pretrained/RedPajama-7B/prepare.py ================================================ import os import argparse import torch from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig DIR = os.path.dirname(os.path.abspath(__file__)) USE_AUTH_TOKEN = False if __name__ == '__main__': parser = argparse.ArgumentParser(description='Convert HF checkpoints') parser.add_argument('--model-name', type=str, default='togethercomputer/RedPajama-INCITE-7B-Chat', help='model-name') parser.add_argument('--save-dir', type=str, default=DIR, help='model-name') parser.add_argument('--offload-dir', type=str, default=None, help='directory to offload from memory') args = parser.parse_args() if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_')) if not os.path.exists(save_path): os.mkdir(save_path) print('loading model from HF...') config = AutoConfig.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) config.save_pretrained(save_path) tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) tokenizer.save_pretrained(save_path) # offload model from memory to disk if offload-dir is specified if args.offload_dir is not None: if not os.path.exists(args.offload_dir): os.mkdir(args.offload_dir) model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=args.offload_dir, use_auth_token=USE_AUTH_TOKEN) else: model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, use_auth_token=USE_AUTH_TOKEN) print('loaded model from HF...') print('converting the embedding layer...') item = {} item['embed_in.weight'] = model.gpt_neox.embed_in.weight torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) print('converted the embedding layer.') for i in range(len(model.gpt_neox.layers)): print(f'converting the {i}-th transformer layer...') torch.save(model.gpt_neox.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) print(f'converted the {i}-th transformer layer.') print('converting the lm_head layer...') item = {} item['embed_out.weight'] = model.embed_out.weight item['final_layer_norm.weight'] = model.gpt_neox.final_layer_norm.weight item['final_layer_norm.bias'] = model.gpt_neox.final_layer_norm.bias torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) print('converted the lm_head layer.') ================================================ FILE: pretrained/prepare_pretrained.py ================================================ import os import argparse import torch from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig DIR = os.path.dirname(os.path.abspath(__file__)) USE_AUTH_TOKEN = False # Load pretrained model from HuggingFace and save it to disk def prepare_pretrained(save_path, model_name, offload_dir=None): os.makedirs(save_path, exist_ok=True) print('loading model from HF...') config = AutoConfig.from_pretrained(model_name, use_auth_token=USE_AUTH_TOKEN) config.save_pretrained(save_path) tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=USE_AUTH_TOKEN) tokenizer.save_pretrained(save_path) # offload model from memory to disk if offload-dir is specified if offload_dir is not None: os.makedirs(offload_dir, exist_ok=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=offload_dir, use_auth_token=USE_AUTH_TOKEN) else: model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, use_auth_token=USE_AUTH_TOKEN) print('loaded model from HF...') print('converting the embedding layer...') item = {} item['embed_in.weight'] = model.gpt_neox.embed_in.weight torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) print('converted the embedding layer.') for i in range(len(model.gpt_neox.layers)): print(f'converting the {i}-th transformer layer...') torch.save(model.gpt_neox.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) print(f'converted the {i}-th transformer layer.') print('converting the lm_head layer...') item = {} item['embed_out.weight'] = model.embed_out.weight item['final_layer_norm.weight'] = model.gpt_neox.final_layer_norm.weight item['final_layer_norm.bias'] = model.gpt_neox.final_layer_norm.bias torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) print('converted the lm_head layer.') # python pretrained/prepare_pretrained.py --model-name EleutherAI/gpt-neox-125M --save-dir pretrained/files --offload-dir pretrained/files/offload def main(): parser = argparse.ArgumentParser(description='Convert HF checkpoints') parser.add_argument('--model-name', type=str, required=True, help='model-name') parser.add_argument('--save-dir', type=str, required=True, help='model-name') parser.add_argument('--offload-dir', type=str, default=None, help='directory to offload from memory') args = parser.parse_args() prepare_pretrained(args.save_dir, args.model_name, args.offload_dir) if __name__ == '__main__': main() ================================================ FILE: retrieval/README.md ================================================ # Retrieval-Enhanced Chatbot This is a demonstration of how to enhance a chatbot using Wikipedia. We'll be using [ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index](https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index). for this demo. Thank Christoph for providing this resource! In this demo, we'll be extending the approach of comparing and adding the adjacent `w` sentences to the matched sentence if their cosine similarity is larger than `w_th`. By doing so, we can provide the chatbot with a longer context, which may improve its performance. This demo combines both the above index and the chat model into one system ## Start the combined server To get started, we need to install some dependencies and download the Wikipedia index: 0. Install dependencies Install the necessary dependencies, including `torch`, `transformers`, `flask`, `faiss`, and `fastparquet`. 1. Open up wiki-server.py and set model_name_or_path to point to the path that contains the chat model 2. Start the retrieval server ```shell python wiki-server.py ``` The server will listen on port 7003. It will download the data sets from ChristophSchuhman. This may take a few minutes. 3. Test the full retrieval enhanced chatbot We now demonstrate both the wiki index and the GPT-NeoX-fine-tuned model. ```curl -X POST -H 'Content-Type: application/json' http://127.0.0.1:7003/inference -d '{ "prompt" : "where is zurich located?" }'``` Internally we first query the wiki index and generate a response using the provided model. To do this, We concatenate the retrieved information and the users' query into a prompt, encode it with a tokenizer, and generate a response using the chatbot model. The response should indicate the location of Zurich city. 4. To test just the retrieval functionality of the system you can can do the following. Curl works as well. ```python import requests endpoint = 'http://127.0.0.1:7003/search' res = requests.post(endpoint, json={ 'query': 'Where is Zurich?', 'k': 1, 'w': 5, 'w_th': 0.7, }) print(res.json()) ``` This should print the most relevant sentences about Zurich from Wikipedia. By increasing w and decreasing w_th, we can retrieve a longer context. ================================================ FILE: retrieval/__init__.py ================================================ ================================================ FILE: retrieval/wikipedia.py ================================================ # This file was adapted from ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index: # https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index/blob/main/wikiindexquery.py # # The original file was licensed under the Apache 2.0 license. import os from transformers import AutoTokenizer, AutoModel import faiss import numpy as np import pandas as pd DIR = os.path.dirname(os.path.abspath(__file__)) def mean_pooling(token_embeddings, mask): token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.) sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None] return sentence_embeddings def cos_sim_2d(x, y): norm_x = x / np.linalg.norm(x, axis=1, keepdims=True) norm_y = y / np.linalg.norm(y, axis=1, keepdims=True) return np.matmul(norm_x, norm_y.T) class WikipediaIndex: def __init__(self): path = os.path.join(DIR, '..', 'data', 'wikipedia-3sentence-level-retrieval-index', 'files') indexpath = os.path.join(path, 'knn.index') wiki_sentence_path = os.path.join(path, 'wikipedia-en-sentences.parquet') self._device = 'cuda' self._tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco') self._contriever = AutoModel.from_pretrained('facebook/contriever-msmarco').to(self._device) self._df_sentences = pd.read_parquet(wiki_sentence_path, engine='fastparquet') self._wiki_index = faiss.read_index(indexpath, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY) def search(self, query, k=1, w=5, w_th=0.5): inputs = self._tokenizer(query, padding=True, truncation=True, return_tensors='pt').to(self._device) outputs = self._contriever(**inputs) embeddings = mean_pooling(outputs[0], inputs['attention_mask']) query_vector = embeddings.cpu().detach().numpy().reshape(1, -1) distances, indices = self._wiki_index.search(query_vector, k) texts = [] for i, (dist, indice) in enumerate(zip(distances[0], indices[0])): text = self._df_sentences.iloc[indice]['text_snippet'] try: input_texts = [self._df_sentences.iloc[indice]['text_snippet']] for j in range(1, w+1): input_texts = [self._df_sentences.iloc[indice-j]['text_snippet']] + input_texts for j in range(1, w+1): input_texts = input_texts + [self._df_sentences.iloc[indice+j]['text_snippet']] inputs = self._tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt').to(self._device) outputs = self._contriever(**inputs) embeddings = mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy() for j in range(1, w+1): if cos_sim_2d(embeddings[w-j].reshape(1, -1), embeddings[w].reshape(1, -1)) > w_th: text = self._df_sentences.iloc[indice-j]['text_snippet'] + text else: break for j in range(1, w+1): if cos_sim_2d(embeddings[w+j].reshape(1, -1), embeddings[w].reshape(1, -1)) > w_th: text += self._df_sentences.iloc[indice+j]['text_snippet'] else: break except Exception as e: print(e) texts.append(text) return texts ================================================ FILE: tools/README.md ================================================ # OpenChatKit Tools ## convert_to_hf_gptneox.py ## ml_load_benchmark.py The commands to run the model load benchmark tool is: ```shell $ python3 model_load_benchmark.py -i benchmark_input.json -o benchmark_results.json -d cuda:0 ``` ``` usage: model_load_benchmark.py [-h] -i INPUT -o OUTPUT [-d DEVICE] [-r REPEAT_INFER] Benchmark downloading, loading, and running an inferernce for a set of ML models. optional arguments: -h, --help show this help message and exit -i INPUT, --input INPUT Input JSON file containing models to be benchmark -o OUTPUT, --output OUTPUT Output JSON file with model benchmark results -d DEVICE, --device DEVICE Cuda device name, e.g. "cuda:0" -r REPEAT_INFER, --repeat-infer REPEAT_INFER Repeat inferrence for warm timings ``` The input file is a JSON file with the names and paths of the models to be tested. For example: ```JSON { "GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B", "Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B", "GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B", "GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1", "GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0" } ``` The output is a json file with the timings for: 1. tokenizer download time in seconds -- `tokenizer_download_sec` 2. tokenizer load time in seconds -- `tokenizer_load_sec` 3. model download time -- `model_download_sec` 5. model load to RAM time -- `model_load_to_ram_sec` 6. model transfer to GPU time -- `model_transfer_to_gpu_sec` 7. inference time (input is "hello, world!") -- `inference_sec` 8. total time (sum of all the above) -- `total_sec` 9. inference time from a warm start (the average of running inference `REPEAT_INFER` times) -- `inference_warm_sec` 10. model main memory footprint in MB -- `model_main_memory_MB` 11. model GPU memory footprint in MB -- `model_gpu_memory_MB` An example of the output is: ```JSON { "GPT-JT-6B-v1": { "tokenizer_download_sec": 1.52, "tokenizer_load_sec": 0.10, "model_download_sec": 124.70, "model_load_to_ram_sec": 127.81, "model_main_memory_MB": 12297.10, "model_transfer_to_gpu_sec": 3.29, "model_gpu_memory_MB": 12219.74, "inference_sec": 0.93, "inference_warm_sec": 0.047, "total_sec": 258.38 } } ``` ================================================ FILE: tools/benchmark_input.json ================================================ { "GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B", "Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B", "GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B", "GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1", "GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0" } ================================================ FILE: tools/convert_to_hf_gptneox.py ================================================ import torch import torch.nn as nn import argparse from transformers import GPTNeoXForCausalLM from transformers import AutoConfig, AutoTokenizer from transformers.modeling_utils import no_init_weights import os def create_empty_gptneox(config): import torch import torch.nn as nn _reset_parameters_linear = nn.Linear.reset_parameters def dummy(*args, **kargs): pass nn.Linear.reset_parameters = dummy # 1. disable init for faster initialization # 2. avoid tie token embeddings with lm_head, as we train them separately. with no_init_weights(_enable=True): model = GPTNeoXForCausalLM(config).eval() nn.Linear.reset_parameters = _reset_parameters_linear return model def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_per_stage=14): input_path = checkpoint_path assert n_stages * n_layer_per_stage >= len(model.gpt_neox.layers) # assert model.lm_head.weight.data is not model.transformer.wte.weight.data for i in range(n_stages): print(f'loading stage {i}') checkpoint = torch.load(os.path.join(input_path, f'prank_{i}_checkpoint.pt'), map_location=torch.device("cpu")) if i == 0: _tmp = {k[len(f"{0}."):]:v for k,v in checkpoint.items() if k.startswith(f"0.")} # torch.save(_tmp, os.path.join(output_path, f'pytorch_embs.pt')) model.gpt_neox.embed_in.weight.data[:] = _tmp['embed_in.weight'] for j in range(n_layer_per_stage): _tmp = {k[len(f"{j+1}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j+1}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_{j}.pt')) model.gpt_neox.layers[j].load_state_dict(_tmp) elif i == n_stages - 1: for j in range(n_layer_per_stage): _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) model.gpt_neox.layers[i*n_layer_per_stage + j].load_state_dict(_tmp) if i*n_layer_per_stage + j == len(model.gpt_neox.layers) - 1: j += 1 break _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_lm_head.pt')) model.gpt_neox.final_layer_norm.weight.data[:] = _tmp['final_layer_norm.weight'] model.gpt_neox.final_layer_norm.bias.data[:] = _tmp['final_layer_norm.bias'] model.embed_out.weight.data[:] = _tmp['embed_out.weight'] if 'embed_out.bias' in _tmp: model.embed_out.bias.data[:] = _tmp['embed_out.bias'] else: for j in range(n_layer_per_stage): _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) model.gpt_neox.layers[i*n_layer_per_stage + j].load_state_dict(_tmp) return model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Convert HF checkpoints') parser.add_argument('--config-name', type=str, default='EleutherAI/gpt-neox-20b', help='config-name') parser.add_argument('--ckpt-path', type=str, default=None, help='ckpt-path') parser.add_argument('--save-path', type=str, default=None, help='save-path') parser.add_argument('--n-stages', type=int, default=8, help='pipeline group size') parser.add_argument('--n-layer-per-stage', type=int, default=6, help='n layers per GPU device') parser.add_argument('--fp16', default=False, action='store_true') args = parser.parse_args() assert args.ckpt_path is not None assert args.save_path is not None os.makedirs(args.save_path, exist_ok=True) print('loading config...') config = AutoConfig.from_pretrained(args.config_name) print('loaded config.') print('loading tokenizer...') tokenizer = AutoTokenizer.from_pretrained(args.config_name) print('loaded tokenizer.') print('creating empty model...') model = create_empty_gptneox(config) if args.fp16: model = model.half() print('created empty model.') print('loading model ckpt...') load_decentralized_checkpoint( model, args.ckpt_path, n_stages=args.n_stages, n_layer_per_stage=args.n_layer_per_stage, ) print('loaded model ckpt.') print('saving HF model...') model.save_pretrained(args.save_path) print(f'saved HF model to `{args.save_path}`') config.save_pretrained(args.save_path) tokenizer.save_pretrained(args.save_path) ================================================ FILE: tools/convert_to_hf_llama.py ================================================ import os import argparse import torch import torch import torch.nn as nn from transformers import LlamaForCausalLM from transformers import AutoConfig, AutoTokenizer from transformers.modeling_utils import no_init_weights import os def create_emtpy_llama(config): import torch import torch.nn as nn _reset_parameters_linear = nn.Linear.reset_parameters def dummy(*args, **kargs): pass nn.Linear.reset_parameters = dummy # 1. disable init for faster initialization # 2. avoid tie token embeddings with lm_head, as we train them separately. with no_init_weights(_enable=True): model = LlamaForCausalLM(config).eval() nn.Linear.reset_parameters = _reset_parameters_linear return model def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_per_stage=16, ): input_path = checkpoint_path n_layers = len(model.model.layers) assert n_stages * n_layer_per_stage >= len(model.model.layers) # assert model.lm_head.weight.data is not model.transformer.wte.weight.data for i in range(n_stages): print(f'loading stage {i}') checkpoint = torch.load(os.path.join(input_path, f'prank_{i}_checkpoint.pt'), map_location=torch.device("cpu")) if i == 0: _tmp = {k[len(f"{0}."):]:v for k,v in checkpoint.items() if k.startswith(f"0.")} # torch.save(_tmp, os.path.join(output_path, f'pytorch_embs.pt')) model.model.embed_tokens.weight.data[:] = _tmp['embed_tokens.weight'] for j in range(n_layer_per_stage): _tmp = {k[len(f"{j+1}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j+1}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_{j}.pt')) ret = model.model.layers[j].load_state_dict(_tmp, strict=False) if len(ret.missing_keys): print('The following weight keys are missing:') print(ret.missing_keys) if len(ret.unexpected_keys): print('The following weight keys are unexpected:') print(ret.unexpected_keys) elif i == n_stages - 1: for j in range(n_layer_per_stage): if i*n_layer_per_stage + j == n_layers: break _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) ret = model.model.layers[i*n_layer_per_stage + j].load_state_dict(_tmp, strict=False) if len(ret.missing_keys): print('The following weight keys are missing:') print(ret.missing_keys) if len(ret.unexpected_keys): print('The following weight keys are unexpected:') print(ret.unexpected_keys) else: j += 1 _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_lm_head.pt')) model.model.norm.weight.data[:] = _tmp['norm.weight'] if 'norm.bias' in _tmp: model.model.norm.bias.data[:] = _tmp['norm.bias'] model.lm_head.weight.data[:] = _tmp['lm_head.weight'] if 'lm_head.bias' in _tmp: model.lm_head.bias.data[:] = _tmp['lm_head.bias'] else: for j in range(n_layer_per_stage): _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} if len(_tmp) == 0: break # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) ret = model.model.layers[i*n_layer_per_stage + j].load_state_dict(_tmp, strict=False) if len(ret.missing_keys): print('The following weight keys are missing:') print(ret.missing_keys) if len(ret.unexpected_keys): print('The following weight keys are unexpected:') print(ret.unexpected_keys) return model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Convert HF checkpoints') parser.add_argument('--config-name', type=str, default='togethercomputer/Llama-2-7B-32K-beta', help='config-name') parser.add_argument('--ckpt-path', type=str, default=None, help='ckpt-path') parser.add_argument('--save-path', type=str, default=None, help='save-path') parser.add_argument('--n-stages', type=int, default=8, help='pipeline group size') parser.add_argument('--n-layer-per-stage', type=int, default=4, help='n layers per GPU device') parser.add_argument('--fp16', default=False, action='store_true') args = parser.parse_args() assert args.ckpt_path is not None assert args.save_path is not None if not os.path.exists(args.save_path): os.mkdir(args.save_path) # LlamaForCausalLM LlamaConfig LlamaTokenizer print('loading config...') config = AutoConfig.from_pretrained(args.config_name) print('loaded config.') print('loading tokenizer...') tokenizer = AutoTokenizer.from_pretrained(args.config_name) print('loaded tokenizer.') print('creating empty model...') model = create_emtpy_llama(config) if args.fp16: model = model.half() print('created empty model.') print('loading model ckpt...') load_decentralized_checkpoint( model, args.ckpt_path, n_stages=args.n_stages, n_layer_per_stage=args.n_layer_per_stage, ) print('loaded model ckpt.') print('saving HF model...') model.save_pretrained(args.save_path) print(f'saved HF model to `{args.save_path}`') config.save_pretrained(args.save_path) tokenizer.save_pretrained(args.save_path) ================================================ FILE: tools/model_load_benchmark.py ================================================ import argparse import json import time import torch import torchvision import os import re import psutil from transformers import AutoTokenizer, AutoModelForCausalLM # Benchmark download, tokenize, load, inference time. def benchmark(model_dict: dict, device_name: str, repeat_infer: int): # Initialize the benchmark results dictionary results_dict = {} # Check that we have CUDA GPUs available before running the benchmark if not torch.cuda.is_available(): print("ERROR: CUDA GPUs are not available, benchmark not run") return results_dict device = torch.device(device_name) process = psutil.Process() print(f'Using device {device}') # Loop through the models to test for model_name, model_path in model_dict.items(): # purge unused cached memory torch.cuda.empty_cache() print(f"Testing model: {model_name}") # Measure the time it takes to download the tokenizer data and load the tokenizer tokenizer_download_start_time = time.time() tokenizer = AutoTokenizer.from_pretrained(model_path, force_download=True) tokenizer_download_end_time = time.time() tokenizer = None # Measure the time it takes to load the tokenizer tokenizer_load_start_time = time.time() tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer_load_end_time = time.time() tokenizer_load_sec = tokenizer_load_end_time - tokenizer_load_start_time tokenizer_download_sec = tokenizer_download_end_time - tokenizer_download_start_time - tokenizer_load_sec print(f"Testing model: {model_name} --- tokenizer download time = {tokenizer_download_sec:.3} sec") print(f"Testing model: {model_name} --- tokenize load time = {tokenizer_load_sec:.3} sec") # Measure the time it takes to download and load the model into main memory model_download_start_time = time.time() model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True, force_download=True) model_download_end_time = time.time() model = None # Measure the time it takes to load the model into main memory memory_used_main_start = process.memory_info().rss model_load_to_ram_start_time = time.time() model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True) model_load_to_ram_end_time = time.time() memory_used_main_end = process.memory_info().rss model_load_to_ram_sec = model_load_to_ram_end_time - model_load_to_ram_start_time model_download_sec = model_download_end_time - model_download_start_time - model_load_to_ram_sec model_main_memory_bytes = memory_used_main_end - memory_used_main_start print(f"Testing model: {model_name} --- model download time = {model_download_sec:.3} sec") print(f"Testing model: {model_name} --- model load to RAM time = {model_load_to_ram_sec:.3} sec") print(f"Testing model: {model_name} --- model main memory size = {model_main_memory_bytes} bytes") # Measure the time it takes to load the model from main memory to the GPU gpu_memory_start = torch.cuda.memory_allocated(device) model_xfer_to_gpu_start_time = time.time() model = model.to(device) model_xfer_to_gpu_end_time = time.time() gpu_memory_end = torch.cuda.memory_allocated(device) model_xfer_to_gpu_sec = model_xfer_to_gpu_end_time - model_xfer_to_gpu_start_time model_gpu_memory_bytes = gpu_memory_end - gpu_memory_start print(f"Testing model: {model_name} --- model transfer to GPU time = {model_xfer_to_gpu_sec:.3} sec") print(f"Testing model: {model_name} --- model GPU memory size = {model_gpu_memory_bytes} bytes") # Measure the time it takes to run inference from a cold start inference_start_time = time.time() inputs = tokenizer("Hello, world!", return_tensors="pt").to(device) outputs = model(**inputs) inference_end_time = time.time() inference_sec = inference_end_time - inference_start_time print(f"Testing model: {model_name} --- inference time = {inference_sec:.3} sec") # Measure the time it takes to run inference from a cold start inference_warm_start_time = time.time() for i in range(0, repeat_infer): inputs = tokenizer("Hello, world!", return_tensors="pt").to(device) outputs = model(**inputs) inference_warm_end_time = time.time() inference_warm_sec = (inference_warm_end_time - inference_warm_start_time) / float(repeat_infer) print(f"Testing model: {model_name} --- inference warm time = {inference_warm_sec:.3} sec") total_sec = tokenizer_download_sec + tokenizer_load_sec + model_download_sec + model_load_to_ram_sec + model_xfer_to_gpu_sec + inference_sec print(f"Testing model: {model_name} --- total time = {total_sec:.3} sec") # Add the results to the dictionary results_dict[model_name] = { "tokenizer_download_sec": tokenizer_download_sec, "tokenizer_load_sec": tokenizer_load_sec, "model_download_sec": model_download_sec, "model_load_to_ram_sec": model_load_to_ram_sec, "model_main_memory_MB": float(model_main_memory_bytes) / 1000000.0, "model_transfer_to_gpu_sec": model_xfer_to_gpu_sec, "model_gpu_memory_MB": float(model_gpu_memory_bytes) / 1000000.0, "inference_sec": inference_sec, "inference_warm_sec": inference_warm_sec, "total_sec": total_sec } # Unload the model model = None torch.cuda.empty_cache() return results_dict # Define the main function def main(input_file: str, output_file: str, device_name: str, repeat_infer: int): # Load the models to test from the input JSON file with open(input_file, "r") as f: model_dict = json.load(f) # Run the benchmark results_dict = benchmark(model_dict, device_name, repeat_infer) # Write the results to the JSON output file # use a regular expression to apply formatting to floatin point json_data = re.sub('"(.*?)":\s*(0\.0*\d{2}|\d+\.\d{2})\d*(,?\n)', '"\\1": \\2\\3', json.dumps(results_dict, indent=4)) with open(output_file, 'w') as f: f.write(json_data) if __name__ == "__main__": # Create an argument parser parser = argparse.ArgumentParser(description='Benchmark downloading, loading, and running an inferernce for a set of ML models.') parser.add_argument('-i', '--input', required=True, help='Input JSON file containing models to be benchmark') parser.add_argument('-o', '--output', required=True, help='Output JSON file with model benchmark results') parser.add_argument('-d', '--device', required=False, default='cuda:0', help='Cuda device name, e.g. "cuda:0"') parser.add_argument('-r', '--repeat-infer', required=False, default=30, help='Repeat inferrence for warm timings') # Parse the command line arguments args = parser.parse_args() # Process the data main(args.input, args.output, args.device, max(args.repeat_infer, 1)) ================================================ FILE: training/README.md ================================================ # OpenChatKit Training This directory contains code for training a chat model using OpenChatKit. The main training script is `finetune_GPT-NeoXT-Chat-Base-20B.sh`. To customize training, make a copy of the script and modify the arguments. ## Arguments Environment vars that should be set: ```bash export GLOO_SOCKET_IFNAME=lo # this interface should be consistent to `--net-interface` export NCCL_SOCKET_IFNAME=lo # this interface should be consistent to `--net-interface` export WANDB_NAME=gptj-test # wandb run name ``` The following arguments should be carefully set: - `--model-name`: The path of model ckpt sharded by layers. - `--tokenizer-name`: Usually the same to `--model-name`. You can also use HF's model name. - `--model-type`: Indicate the model type. {gptj}. More model types will be added soon. - `--num-layers`: Number of Transformer layers **for each GPU**. E.g. GPT-J has 28 layers, if we use two GPUs to form a pipeline, `--num-layers` should be 14. - `--embedding-dim`: The hidden size of the model. GPT-J-6B is 4096. This is used to create buffers. - `--dist-url`: URL of rank 0 worker (master). It is the same to all workers. And this URL should be accessible by all workers. For local training (single machine multiple GPUs), this can be like `--dist-url tcp://127.0.0.1:7033` - `--world-size`: The total number of workers. `world-size == pipeline-group-size * data-group-size` - `--pipeline-group-size`: Number of GPU workers for each pipeline - `--data-group-size`: Number of data parallel workers. Also the number of pipelines. - `--net-interface`: Network interface. Should be consistent with `GLOO_SOCKET_IFNAME` and `NCCL_SOCKET_IFNAME`. The following arguments can be tuned / changed: - `--train-log-backend `: How to log the training info. {print, loguru, wandb}. - `--optimizer`: Optimizer type. {adam, 8bit-adam} (8bit-adam requires `pip install bitsandbytes`) - `--load-pretrained-model`: Whether to load model weights. Usually `true`. - `--task-name`: The task name or the path of a `jsonl` file. For multi-task training separate task names by `,`. There is an optional sampling weight after each task name, separated by `:` (default is 1.0). Sampling weights will be normalized. E.g. it should be like `--task-name cot:0.1,/path_task0.jsonl:1.0,/path_task0.jsonl:1.0,/path_task0.jsonl:1.0`. The number after the colon indicates the sampling weight for the task during training. For example, `cot:0.1` means the `cot` task will be sampled with a weight of 0.1. - `--checkpoint-path`: Path to save fine-tuned checkpoints. - `--checkpoint-steps`: Save ckpt every `checkpoint-steps`. - `--total-steps`: Total number of steps for training. (This counts all `gradient-accumulate-step`s.) - `--warmup-steps`: LR warmup steps. - `--lr`: learning rate - `--seq-length`: sequence length - `--batch-size`: batch size for each GPU device (of each gradient accumulation step). - `--micro-batch-size`: micro batch size for pipeline parallelism. 1 works fine. - `--gradient-accumulate-step`: Accumulate gradients for several steps before updating parameters. This is another way to achieve large batch sizes when GPU memory is not enough. The following arguments usually do not change: - `--dp-backend`: {nccl, gloo}, default nccl. - `--dp-mode`: {allreduce}. - `--fp16`: Flag to enable FP16 mixed precision training. Should always adding it for the current impl. - `--pp-mode`: always `gpipe` - `--profiling`: {no-profiling, tidy_profiling}. `tidy_profiling` will generate profile jsons. ## Adding Your Own Data to the DATASETS To add your own data to the training process, you should create a `jsonl` file where each line is a JSON object representing a single training example. Once you have your `jsonl` file, you can include it in the `--task-name` argument with an appropriate sampling weight. For instance, if your file is located at `/path_to_your_data/your_data.jsonl` and you wish to give it a sampling weight of 0.5, you would add `/path_to_your_data/your_data.jsonl:0.5` to the `--task-name` argument. If you have any questions or need further assistance, please refer to the [OpenDataHub](https://github.com/togethercomputer/OpenDataHub) repository or contact us through our [website](https://www.together.ai/contact). ================================================ FILE: training/comm/__init__.py ================================================ ================================================ FILE: training/comm/comm_utils.py ================================================ from .torch_backend import * from .nccl_backend import * _DATA_PARALLEL_COMM = None _DATA_PARALLEL_RANK = None _DATA_PARALLEL_WORLD_SIZE = None _PIPELINE_PARALLEL_COMM = None _PIPELINE_PARALLEL_RANK = None _PIPELINE_PARALLEL_WORLD_SIZE = None _TENSOR_PARALLEL_COMM = None _TENSOR_PARALLEL_RANK = None _TENSOR_PARALLEL_WORLD_SIZE = None import threading _LOCK = threading.RLock() def get_lock(): return _LOCK def get_data_parallel_comm() -> NCCLCommunicator: assert _DATA_PARALLEL_COMM is not None return _DATA_PARALLEL_COMM def get_data_parallel_rank() -> int: assert _DATA_PARALLEL_RANK is not None return _DATA_PARALLEL_RANK def get_data_parallel_world_size() -> int: assert _DATA_PARALLEL_WORLD_SIZE is not None return _DATA_PARALLEL_WORLD_SIZE def get_pipeline_parallel_comm() -> NCCLCommunicator: assert _PIPELINE_PARALLEL_COMM is not None return _PIPELINE_PARALLEL_COMM def get_pipeline_parallel_rank() -> int: assert _PIPELINE_PARALLEL_RANK is not None return _PIPELINE_PARALLEL_RANK def get_pipeline_parallel_world_size() -> int: assert _PIPELINE_PARALLEL_WORLD_SIZE is not None return _PIPELINE_PARALLEL_WORLD_SIZE def get_megatron_tensor_parallel_comm() -> NCCLCommunicator: assert _TENSOR_PARALLEL_COMM is not None return _TENSOR_PARALLEL_COMM def get_megatron_tensor_parallel_rank() -> int: assert _TENSOR_PARALLEL_RANK is not None return _TENSOR_PARALLEL_RANK def get_megatron_tensor_parallel_world_size() -> int: assert _TENSOR_PARALLEL_WORLD_SIZE is not None return _TENSOR_PARALLEL_WORLD_SIZE def default_init(args): import datetime import time try: dist.destroy_process_group() # the first time will raise exception, so the following code is skipped. print('destroy comm, increase port for 1. (this could cause problem)') url = ':'.join(args.dist_url.split(':')[:-1]) port = int(args.dist_url.split(':')[-1]) + 1 args.dist_url = f"{url}:{port}" print(f"new master url: {args.dist_url}") except: pass dist.init_process_group(backend='gloo', timeout=datetime.timedelta(seconds=5*60), init_method=args.dist_url, world_size=args.world_size, rank=args.rank) def init_communicators(args): default_init(args) assert args.world_size == args.data_group_size * args.pipeline_group_size if args.world_size == args.data_group_size * args.pipeline_group_size: # We do the following hard code alignment of communication groups: # Suppose there are 8 instances (world_size), and 4 data parallel groups (data_group_size is 2), # Then there would be 2 pipeline parallel groups (pipeline_group_size is 4), then the groups will look like: # pipeline parallel: , # data parallel: , , , # assert args.world_size == args.data_group_size * args.pipeline_group_size global _DATA_PARALLEL_COMM global _PIPELINE_PARALLEL_COMM global _DATA_PARALLEL_RANK global _PIPELINE_PARALLEL_RANK global _DATA_PARALLEL_WORLD_SIZE global _PIPELINE_PARALLEL_WORLD_SIZE # We use pipeline parallel by default. _PIPELINE_PARALLEL_WORLD_SIZE = args.pipeline_group_size _PIPELINE_PARALLEL_RANK = args.rank % args.pipeline_group_size _PIPELINE_PARALLEL_COMM = NCCLCommunicator(_PIPELINE_PARALLEL_RANK, args.cuda_id, args.pipeline_group_size, "pipeline_group_"+str(args.rank // args.pipeline_group_size)) if args.data_group_size != 1: _DATA_PARALLEL_WORLD_SIZE = args.data_group_size _DATA_PARALLEL_RANK = args.rank // args.pipeline_group_size dp_backend = getattr(args, 'dp_backend', 'gloo') if dp_backend == 'nccl': _DATA_PARALLEL_COMM = NCCLCommunicator(_DATA_PARALLEL_RANK, args.cuda_id, args.data_group_size, "data_group_"+str(args.rank % args.pipeline_group_size)) elif dp_backend == 'gloo': for i in range(args.pipeline_group_size): ranks = [rank for rank in range(i, args.world_size, args.pipeline_group_size)] print(args.rank, ranks) data_group = torch.distributed.new_group(ranks, backend='gloo') if args.rank in ranks: def to_global_rank(dp_rank): rank = _PIPELINE_PARALLEL_RANK + dp_rank * args.pipeline_group_size # print(f"{dp_rank} --> {rank}") return rank _DATA_PARALLEL_COMM = TorchCommunicator( data_group, to_global_rank=to_global_rank, dp_rank=_DATA_PARALLEL_RANK, comm_group_size=args.data_group_size,) else: assert False print('comm init done!!') # elif args.world_size == args.data_group_size * args.tensor_group_size: # global _DATA_PARALLEL_COMM # global _TENSOR_PARALLEL_COMM # global _DATA_PARALLEL_RANK # global _TENSOR_PARALLEL_RANK # global _DATA_PARALLEL_WORLD_SIZE # global _TENSOR_PARALLEL_WORLD_SIZE # We use megatron tensor parallel by default. # _TENSOR_PARALLEL_WORLD_SIZE = args.tensor_group_size # _TENSOR_PARALLEL_RANK = args.rank % args.tensor_group_size # _TENSOR_PARALLEL_COMM = NCCLCommunicator(_TENSOR_PARALLEL_RANK, args.cuda_id, args.tensor_group_size, # "tensor_group_" + str(args.rank // args.tensor_group_size)) # if args.data_group_size != 1: # _DATA_PARALLEL_WORLD_SIZE = args.data_group_size # _DATA_PARALLEL_RANK = args.rank // args.tensor_group_size # _DATA_PARALLEL_COMM = NCCLCommunicator(_DATA_PARALLEL_RANK, args.cuda_id, args.data_group_size, # "data_group_" + str(args.rank % args.tensor_group_size)) else: print("Not supported yet") assert False def reinit_dp_communicator(args): print('###### reinit start #######') default_init(args) assert args.world_size == args.data_group_size * args.pipeline_group_size if args.world_size == args.data_group_size * args.pipeline_group_size: # We do the following hard code alignment of communication groups: # Suppose there are 8 instances (world_size), and 4 data parallel groups (data_group_size is 2), # Then there would be 2 pipeline parallel groups (pipeline_group_size is 4), then the groups will look like: # pipeline parallel: , # data parallel: , , , # assert args.world_size == args.data_group_size * args.pipeline_group_size global _DATA_PARALLEL_COMM global _PIPELINE_PARALLEL_COMM global _DATA_PARALLEL_RANK global _PIPELINE_PARALLEL_RANK global _DATA_PARALLEL_WORLD_SIZE global _PIPELINE_PARALLEL_WORLD_SIZE if args.data_group_size != 1: dp_backend = getattr(args, 'dp_backend', 'gloo') if dp_backend == 'nccl': raise Exception('NCCL cannot reinit.') elif dp_backend == 'gloo': for i in range(args.pipeline_group_size): ranks = [rank for rank in range(i, args.world_size, args.pipeline_group_size)] print(args.rank, ranks) data_group = torch.distributed.new_group(ranks, backend='gloo') if args.rank in ranks: def to_global_rank(dp_rank): rank = _PIPELINE_PARALLEL_RANK + dp_rank * args.pipeline_group_size # print(f"{dp_rank} --> {rank}") return rank _DATA_PARALLEL_COMM = TorchCommunicator( data_group, to_global_rank=to_global_rank, dp_rank=_DATA_PARALLEL_RANK, comm_group_size=args.data_group_size,) else: assert False print('######## dp comm reinit done!! ########') ================================================ FILE: training/comm/nccl_backend.py ================================================ import torch import numpy as np import cupy import cupy.cuda.nccl import torch.distributed as dist from typing import List def _type_torch_to_cupy(torch_type: torch.dtype): # print(torch_type) mappings = { torch.uint8: cupy.cuda.nccl.NCCL_UINT8, torch.int32: cupy.cuda.nccl.NCCL_INT32, torch.int64: cupy.cuda.nccl.NCCL_INT64, torch.int: cupy.cuda.nccl.NCCL_INT, torch.float16: cupy.cuda.nccl.NCCL_FLOAT16, torch.float32: cupy.cuda.nccl.NCCL_FLOAT32, torch.float64: cupy.cuda.nccl.NCCL_FLOAT64, torch.float: cupy.cuda.nccl.NCCL_FLOAT } return mappings[torch_type] class NCCLCommunicator: def __init__(self, comm_rank: int, cuda_id: int, comm_group_size: int, comm_name: str): self.comm_rank = comm_rank cupy.cuda.Device(cuda_id).use() self.comm_group_size = comm_group_size print("Initialize NCCLCommunicator: <", comm_name, ">; rank:", comm_rank) self.dist_store = dist.distributed_c10d._get_default_store() if self.comm_rank == 0: cuda_id = cupy.cuda.nccl.get_unique_id() # print(cuda_id) cuda_id_str = np.array(cuda_id).tobytes() self.dist_store.set('group-'+comm_name+'-unique-id', cuda_id_str) # print("Master put .") else: cuda_id_str = self.dist_store.get('group-'+comm_name+'-unique-id') comm_id = tuple(np.frombuffer(cuda_id_str, dtype=int)) # comm_id = cupy.cuda.nccl.get_unique_id() # print(comm_id) self.comm = cupy.cuda.nccl.NcclCommunicator(comm_group_size, comm_id, comm_rank) @staticmethod def barrier(): dist.barrier() def store_set(self, key, value): self.dist_store.set(key, value) def store_get(self, key): return self.dist_store.get(key) def send(self, tensor: torch.Tensor, dst: int, stream=cupy.cuda.Stream.null): # print("Send tensor of size:", torch.numel(tensor)) self.comm.send( tensor.data_ptr(), torch.numel(tensor), _type_torch_to_cupy(tensor.dtype), dst, stream.ptr ) def recv(self, tensor: torch.Tensor, src: int, stream=cupy.cuda.Stream.null): # print("Recv tensor of size:", torch.numel(tensor)) # print("mean:", torch.mean(tensor).item(), " std:", torch.std(tensor).item()) self.comm.recv( tensor.data_ptr(), torch.numel(tensor), _type_torch_to_cupy(tensor.dtype), src, stream.ptr ) def broadcast(self, tensor: torch.Tensor, src: int, stream=cupy.cuda.Stream.null): self.comm.bcast( tensor.data_ptr(), torch.numel(tensor), _type_torch_to_cupy(tensor.dtype), src, stream.ptr ) def reduce(self, tensor: torch.Tensor, dst: int, stream=cupy.cuda.Stream.null, op=cupy.cuda.nccl.NCCL_SUM): self.comm.reduce( tensor.data_ptr(), # force it to be in-place. tensor.data_ptr(), torch.numel(tensor), _type_torch_to_cupy(tensor.dtype), op, dst, stream.ptr ) def all_reduce(self, tensor: torch.Tensor, stream=cupy.cuda.Stream.null, op=cupy.cuda.nccl.NCCL_SUM): self.comm.allReduce( tensor.data_ptr(), tensor.data_ptr(), torch.numel(tensor), _type_torch_to_cupy(tensor.dtype), op, stream.ptr ) def scatter(self, tensor: torch.Tensor, scatter_list: List[torch.Tensor], src: int, stream=cupy.cuda.Stream.null): cupy.cuda.nccl.groupStart() if self.comm_rank == src: for i in range(self.comm_group_size): self.send( scatter_list[i], i, stream ) self.recv( tensor, src, stream ) cupy.cuda.nccl.groupEnd() def gather(self, tensor: torch.Tensor, gather_list: List[torch.Tensor], dst: int, stream=cupy.cuda.Stream.null): cupy.cuda.nccl.groupStart() if self.comm_rank == dst: for i in range(self.comm_group_size): self.recv( gather_list[i], i, stream ) self.send( tensor, dst, stream ) cupy.cuda.nccl.groupEnd() def all_to_all(self, output_tensor_list: List[torch.Tensor], input_tensor_list: List[torch.Tensor], stream=cupy.cuda.Stream.null): assert len(output_tensor_list) == self.comm_group_size and len(input_tensor_list) == self.comm_group_size cupy.cuda.nccl.groupStart() for i in range(self.comm_group_size): self.send(input_tensor_list[i], i, stream) self.recv(output_tensor_list[i], i, stream) cupy.cuda.nccl.groupEnd() def all_gather(self, tensor: torch.Tensor, output_tensor_list: List[torch.Tensor], stream=cupy.cuda.Stream.null ): assert len(output_tensor_list) == self.comm_group_size cupy.cuda.nccl.groupStart() for i in range(self.comm_group_size): self.send(tensor, i, stream) self.recv(output_tensor_list[i], i, stream) cupy.cuda.nccl.groupEnd() def all_reduce_opt(self, tensor: torch.Tensor, buffer: List[torch.Tensor], stream=cupy.cuda.Stream.null, caller=None): # First do all-to-all assert torch.numel(tensor.data) % self.comm_group_size == 0 chunk_size = torch.numel(tensor.data) // self.comm_group_size t_type = _type_torch_to_cupy(tensor.dtype) element_size = tensor.data.element_size() cupy.cuda.nccl.groupStart() for i in range(self.comm_group_size): self.comm.send(tensor.data_ptr()+i*chunk_size*element_size, chunk_size, t_type, i, stream.ptr) self.comm.recv(buffer[i].data_ptr(), chunk_size, t_type, i, stream.ptr) cupy.cuda.nccl.groupEnd() for i in range(1, self.comm_group_size): buffer[0] += buffer[i] cupy.cuda.nccl.groupStart() for i in range(self.comm_group_size): self.comm.send(buffer[0].data_ptr(), chunk_size, t_type, i, stream.ptr) self.comm.recv(tensor.data_ptr()+i*chunk_size*element_size, chunk_size, t_type, i, stream.ptr) cupy.cuda.nccl.groupEnd() ================================================ FILE: training/comm/torch_backend.py ================================================ import torch import torch.distributed as dist from typing import List class TorchCommunicator: def __init__(self, process_group, to_global_rank=lambda rank: rank, dp_rank=None, comm_group_size=None,): self.process_group = process_group self.to_global_rank = to_global_rank self.dp_rank = dp_rank self.comm_group_size = comm_group_size # @staticmethod def barrier(self): dist.barrier(group=self.process_group) def send(self, tensor: torch.Tensor, dst: int, stream=None): # print("Send tensor of size:", torch.numel(tensor)) if tensor.device == torch.device('cpu'): dist.send(tensor, self.to_global_rank(dst), group=self.process_group) else: dist.send(tensor.cpu(), self.to_global_rank(dst), group=self.process_group) def recv(self, tensor: torch.Tensor, src: int, stream=None): if tensor.device == torch.device('cpu'): dist.recv(tensor, self.to_global_rank(src), group=self.process_group) else: buffer = tensor.cpu() dist.recv(buffer, self.to_global_rank(src), group=self.process_group) tensor[:] = buffer.to(tensor.device) def isend(self, tensor: torch.Tensor, dst: int, stream=None): # print("Send tensor of size:", torch.numel(tensor)) if tensor.device == torch.device('cpu'): handler = dist.isend(tensor, self.to_global_rank(dst), group=self.process_group) else: handler = dist.isend(tensor.cpu(), self.to_global_rank(dst), group=self.process_group) return handler def irecv(self, tensor: torch.Tensor, src: int, stream=None): if tensor.device == torch.device('cpu'): handler = dist.irecv(tensor, self.to_global_rank(src), group=self.process_group) else: assert False buffer = tensor.cpu() handler = dist.irecv(buffer, self.to_global_rank(src), group=self.process_group) tensor[:] = buffer.to(tensor.device) return handler def broadcast(self, tensor: torch.Tensor, src: int, stream=None): if tensor.device == torch.device('cpu'): dist.broadcast(tensor, self.to_global_rank(src), group=self.process_group) else: buffer = tensor.cpu() dist.broadcast(buffer, self.to_global_rank(src), group=self.process_group) tensor[:] = buffer.to(tensor.device) def reduce(self, tensor: torch.Tensor, dst: int, stream=None, op=dist.ReduceOp.SUM): dist.reduce(tensor, self.to_global_rank(dst), group=self.process_group, op=op) def all_reduce(self, tensor: torch.Tensor, stream = None, op=dist.ReduceOp.SUM): buffer = tensor.cpu() dist.all_reduce(buffer, group=self.process_group, op=op) tensor[:] = buffer.to(tensor.device) def gather(self, tensor: torch.Tensor, gather_list: List[torch.Tensor], dst: int, stream=None): dist.gather(tensor, gather_list, self.to_global_rank(dst), group=self.process_group) def all_to_all(self, output_tensor_list: List[torch.Tensor], input_tensor_list: List[torch.Tensor], stream=None): dist.all_to_all(output_tensor_list, input_tensor_list, group=self.process_group) def all_gather(self, tensor: torch.Tensor, output_tensor_list: List[torch.Tensor], stream=None): dist.all_gather(output_tensor_list, tensor, group=self.process_group) ================================================ FILE: training/data_parallel/__init__.py ================================================ ================================================ FILE: training/data_parallel/dist_dp_allreduce.py ================================================ import torch.cuda from comm.comm_utils import * from .flatten_utils import flatten_params class AllReduceDP: def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): self.flatten = flatten self.global_rank = args.rank self.dp_group_size = args.data_group_size self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') self.dp_comm = get_data_parallel_comm() self.dp_rank = get_data_parallel_rank() self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.allreduce_grad_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.module = module num_paras, element_size = self._compute_total_para_num() print("Total number of parameters: {}, element size: {}, total size {} MB." .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) if self.flatten: self.flatten_para = flatten_params(self.module.parameters()) print("Flattened parameter number: {}, element size: {}." .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) print("Flattened parameter grad number: {}, element size: {}." .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) assert optimizer is not None self.optimizer = optimizer if self.enable_tidy_profiling: self.global_rank = args.rank self.init_event = None self.init_time_stamp = None if self.flatten: self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) else: self.allreduce_gradients_start_events = dict() self.allreduce_gradients_end_events = dict() for name, _ in self.module.named_parameters(): self.allreduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.allreduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.optimizer_step_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) def _compute_total_para_num(self): total_count = 0 element_size = 0 for para in self.module.parameters(): # print("Parameter: ", para.data.shape) total_count += torch.numel(para.data) element_size = para.element_size() return total_count, element_size def profile_mark_allreduce_start(self, name=None): if self.enable_tidy_profiling: if name is None: self.dp_comm_stream.record_event(self.allreduce_gradients_start_event) else: self.dp_comm_stream.record_event(self.allreduce_gradients_start_events[name]) def profile_mark_allreduce_end(self, name=None): if self.enable_tidy_profiling: if name: self.dp_comm_stream.record_event(self.allreduce_gradients_end_events[name]) def profile_mark_optimizer_step_start(self): if self.enable_tidy_profiling: self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) def _allreduce_gradients(self): with torch.cuda.stream(self.dp_comm_stream): cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) self.dp_comm_stream.wait_event(self.backward_ready_event) if self.flatten: self.profile_mark_allreduce_start() self.dp_comm.all_reduce(self.flatten_para.grad, stream=cupy_dp_stream) self.profile_mark_allreduce_end() else: for name, para in self.module.named_parameters(): if para.grad is None: continue self.profile_mark_allreduce_start(name) self.dp_comm.all_reduce(para.grad, stream=cupy_dp_stream) self.profile_mark_allreduce_end(name) self.dp_comm_stream.record_event(self.allreduce_grad_ready_event) def optimizer_step(self): self._allreduce_gradients() with torch.cuda.stream(self.torch_optim_comp_stream): self.torch_optim_comp_stream.wait_event(self.allreduce_grad_ready_event) self.profile_mark_optimizer_step_start() self.optimizer.step() self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) def set_time_stamp(self, init_time_stamp, init_event): self.init_event = init_event self.init_time_stamp = init_time_stamp def get_ts(self, event): return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 def profiling_data_parallel(self, init_time_stamp, init_event): self.set_time_stamp(init_time_stamp, init_event) profiling_log = [] if self.flatten: allreduce_slot = self.allreduce_gradients_start_event.elapsed_time(self.allreduce_grad_ready_event)*1e+3 allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.allreduce_gradients_start_event), "dur": allreduce_slot, "cname": "cq_build_passed", "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} # print(allreduce_log) profiling_log.append(allreduce_log) else: for name, para in self.module.named_parameters(): allreduce_slot = self.allreduce_gradients_start_events[name].elapsed_time( self.allreduce_gradients_end_events[name]) * 1e+3 allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.allreduce_gradients_start_events[name]), "dur": allreduce_slot, "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} # print(allreduce_log) profiling_log.append(allreduce_log) optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} # print(optimizer_log) profiling_log.append(optimizer_log) return profiling_log ================================================ FILE: training/data_parallel/dist_dp_central_ps.py ================================================ import torch.cuda from comm.comm_utils import * from .flatten_utils import flatten_params class CentralPSDP: def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): self.flatten = flatten self.global_rank = args.rank self.dp_group_size = args.data_group_size self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') self.dp_comm = get_data_parallel_comm() self.dp_rank = get_data_parallel_rank() self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.broadcast_reduced_gradients_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.module = module num_paras, element_size = self._compute_total_para_num() print("Total number of parameters: {}, element size: {}, total size {} MB." .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) if self.flatten: self.flatten_para = flatten_params(self.module.parameters()) print("Flattened parameter number: {}, element size: {}." .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) print("Flattened parameter grad number: {}, element size: {}." .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) assert optimizer is not None self.optimizer = optimizer if self.enable_tidy_profiling: self.global_rank = args.rank self.init_event = None self.init_time_stamp = None if self.flatten: self.reduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) self.reduce_gradients_end_event = torch.cuda.Event(enable_timing=True, blocking=False) self.broadcast_reduced_grad_start_event = torch.cuda.Event(enable_timing=True, blocking=False) else: self.reduce_gradients_start_events = dict() self.reduce_gradients_end_events = dict() self.broadcast_reduced_grad_start_events = dict() self.broadcast_reduced_grad_end_events = dict() for name, _ in self.module.named_parameters(): self.reduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.reduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.broadcast_reduced_grad_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.broadcast_reduced_grad_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.optimizer_step_start_event = torch.cuda.Event(enable_timing=True, blocking=False) def _compute_total_para_num(self): total_count = 0 element_size = 0 for para in self.module.parameters(): # print("Parameter: ", para.data.shape) total_count += torch.numel(para.data) element_size = para.element_size() return total_count, element_size def profile_mark_reduce_start(self, name=None): if self.enable_tidy_profiling: if name is None: self.dp_comm_stream.record_event(self.reduce_gradients_start_event) else: self.dp_comm_stream.record_event(self.reduce_gradients_start_events[name]) def profile_mark_reduce_end(self, name=None): if self.enable_tidy_profiling: if name is None: self.dp_comm_stream.record_event(self.reduce_gradients_end_event) else: self.dp_comm_stream.record_event(self.reduce_gradients_end_events[name]) def profile_mark_optimizer_step_start(self): if self.enable_tidy_profiling: self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) def profile_mark_broadcast_start(self, name=None): if self.enable_tidy_profiling: if name is None: self.dp_comm_stream.record_event(self.broadcast_reduced_grad_start_event) else: self.dp_comm_stream.record_event(self.broadcast_reduced_grad_start_events[name]) def profile_mark_broadcast_end(self, name=None): if self.enable_tidy_profiling: if name: self.dp_comm_stream.record_event(self.broadcast_reduced_grad_end_events[name]) def _reduce_gradients(self): with torch.cuda.stream(self.dp_comm_stream): cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) self.dp_comm_stream.wait_event(self.backward_ready_event) if self.flatten: self.profile_mark_reduce_start() self.dp_comm.reduce(self.flatten_para.grad, dst=0, stream=cupy_dp_stream) self.profile_mark_reduce_end() else: for name, para in self.module.named_parameters(): self.profile_mark_reduce_start(name) self.dp_comm.reduce(para.grad, dst=0, stream=cupy_dp_stream) self.profile_mark_reduce_end(name) def _broadcast_reduced_gradients(self): with torch.cuda.stream(self.dp_comm_stream): cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) if self.flatten: self.profile_mark_broadcast_start() self.dp_comm.broadcast(self.flatten_para.grad, src=0, stream=cupy_dp_stream) self.profile_mark_broadcast_end() else: for name, para in self.module.named_parameters(): self.profile_mark_broadcast_start(name) self.dp_comm.broadcast(para.grad, src=0, stream=cupy_dp_stream) self.profile_mark_broadcast_end(name) self.dp_comm_stream.record_event(self.broadcast_reduced_gradients_ready_event) def optimizer_step(self): self._reduce_gradients() self._broadcast_reduced_gradients() with torch.cuda.stream(self.torch_optim_comp_stream): self.torch_optim_comp_stream.wait_event(self.broadcast_reduced_gradients_ready_event) self.profile_mark_optimizer_step_start() self.optimizer.step() self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) def set_time_stamp(self, init_time_stamp, init_event): self.init_event = init_event self.init_time_stamp = init_time_stamp def get_ts(self, event): return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 def profiling_data_parallel(self, init_time_stamp, init_event): self.set_time_stamp(init_time_stamp, init_event) profiling_log = [] if self.flatten: reduce_slot = self.reduce_gradients_start_event.elapsed_time(self.reduce_gradients_end_event) * 1e+3 reduce_log = {"name": "opt_reduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.reduce_gradients_start_event), "dur": reduce_slot, "cname": "cq_build_passed", "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} # print(reduce_log) profiling_log.append(reduce_log) else: for name, para in self.module.named_parameters(): reduce_slot = self.reduce_gradients_start_events[name].elapsed_time( self.reduce_gradients_end_events[name]) * 1e+3 reduce_log = {"name": "opt_reduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.reduce_gradients_start_events[name]), "dur": reduce_slot, "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} # print(reduce_log) profiling_log.append(reduce_log) optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} # print(optimizer_log) profiling_log.append(optimizer_log) if self.flatten: broadcast_slot = self.broadcast_reduced_grad_start_event.elapsed_time( self.broadcast_reduced_gradients_ready_event) * 1e+3 broadcast_log = {"name": "opt_broadcast", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.broadcast_reduced_grad_start_event), "dur": broadcast_slot, "cname": "cq_build_passed", "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} profiling_log.append(broadcast_log) else: for name, para in self.module.named_parameters(): broadcast_slot = self.broadcast_reduced_grad_start_events[name].elapsed_time( self.broadcast_reduced_grad_end_events[name]) * 1e+3 broadcast_log = {"name": "opt_broadcast", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.broadcast_reduced_grad_start_events[name]), "dur": broadcast_slot, "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} # print(broadcast_log) profiling_log.append(broadcast_log) return profiling_log ================================================ FILE: training/data_parallel/dist_dp_local.py ================================================ import torch.cuda import cupy from comm.comm_utils import * from .flatten_utils import flatten_params class LocalDP: def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): flatten = True self.flatten = flatten self.global_rank = args.rank self.dp_group_size = args.data_group_size self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') self.dp_comm = get_data_parallel_comm() self.dp_rank = get_data_parallel_rank() self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.allreduce_grad_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.module = module num_paras, element_size = self._compute_total_para_num() print("Total number of parameters: {}, element size: {}, total size {} MB." .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) if self.flatten: self.flatten_para = flatten_params(self.module.parameters()) print("Flattened parameter number: {}, element size: {}." .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) print("Flattened parameter grad number: {}, element size: {}." .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) assert optimizer is not None self.optimizer = optimizer if self.enable_tidy_profiling: self.global_rank = args.rank self.init_event = None self.init_time_stamp = None if self.flatten: self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) else: self.allreduce_gradients_start_events = dict() self.allreduce_gradients_end_events = dict() for name, _ in self.module.named_parameters(): self.allreduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.allreduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) self.optimizer_step_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) def _compute_total_para_num(self): total_count = 0 element_size = 0 for para in self.module.parameters(): # print("Parameter: ", para.data.shape) total_count += torch.numel(para.data) element_size = para.element_size() return total_count, element_size def profile_mark_allreduce_start(self, name=None): if self.enable_tidy_profiling: if name is None: self.dp_comm_stream.record_event(self.allreduce_gradients_start_event) else: self.dp_comm_stream.record_event(self.allreduce_gradients_start_events[name]) def profile_mark_allreduce_end(self, name=None): if self.enable_tidy_profiling: if name: self.dp_comm_stream.record_event(self.allreduce_gradients_end_events[name]) def profile_mark_optimizer_step_start(self): if self.enable_tidy_profiling: self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) def allreduce_parameters(self): self._local_parameters_backup = [ p.data.clone() for p in self.module.parameters() ] torch.cuda.synchronize() self.dp_comm.barrier() with torch.cuda.stream(self.dp_comm_stream): cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) self.dp_comm_stream.wait_event(self.backward_ready_event) if self.flatten: self.profile_mark_allreduce_start() self.dp_comm.all_reduce(self.flatten_para.data, stream=cupy_dp_stream) self.flatten_para.data /= self.dp_group_size self.profile_mark_allreduce_end() else: for name, para in self.module.named_parameters(): self.profile_mark_allreduce_start(name) self.dp_comm.all_reduce(para.data, stream=cupy_dp_stream) para.data /= self.dp_group_size self.profile_mark_allreduce_end(name) self.dp_comm_stream.record_event(self.allreduce_grad_ready_event) torch.cuda.synchronize() self.dp_comm.barrier() def rollback_parameters(self): if not hasattr(self, '_local_parameters_backup'): return for p, p_local in zip(self.module.parameters(), self._local_parameters_backup): p.data[:] = p_local.data del self._local_parameters_backup def optimizer_step(self): # torch.cuda.synchronize() with torch.cuda.stream(self.torch_optim_comp_stream): self.torch_optim_comp_stream.record_event(self.allreduce_gradients_start_event) self.torch_optim_comp_stream.record_event(self.allreduce_grad_ready_event) self.torch_optim_comp_stream.wait_event(self.backward_ready_event) self.profile_mark_optimizer_step_start() self.optimizer.step() self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) def set_time_stamp(self, init_time_stamp, init_event): self.init_event = init_event self.init_time_stamp = init_time_stamp def get_ts(self, event): return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 def profiling_data_parallel(self, init_time_stamp, init_event): self.set_time_stamp(init_time_stamp, init_event) profiling_log = [] if self.flatten: allreduce_slot = self.allreduce_gradients_start_event.elapsed_time(self.allreduce_grad_ready_event)*1e+3 allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.allreduce_gradients_start_event), "dur": allreduce_slot, "cname": "cq_build_passed", "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} # print(allreduce_log) profiling_log.append(allreduce_log) else: for name, para in self.module.named_parameters(): allreduce_slot = self.allreduce_gradients_start_events[name].elapsed_time( self.allreduce_gradients_end_events[name]) * 1e+3 allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.allreduce_gradients_start_events[name]), "dur": allreduce_slot, "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} # print(allreduce_log) profiling_log.append(allreduce_log) optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} # print(optimizer_log) profiling_log.append(optimizer_log) return profiling_log ================================================ FILE: training/data_parallel/dist_dp_sharded_ps.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import torch.cuda from comm.comm_utils import * from .flatten_utils import flatten_params class ShardedPSDP: def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): self.flatten = flatten self.global_rank = args.rank self.dp_group_size = args.data_group_size self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') self.dp_comm = get_data_parallel_comm() self.dp_rank = get_data_parallel_rank() self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.sync_gradients_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) self.module = module assert optimizer is not None self.optimizer = optimizer num_paras, element_size = self._compute_total_para_num() print("Total number of parameters: {}, element size: {}, total size {} MB." .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) assert self.flatten # self.para = list(self.module.parameters()) self.flatten_para = flatten_params(self.module.parameters(), self.dp_group_size) print("Flattened parameter number: {}, element size: {}." .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) print("Flattened parameter grad number: {}, element size: {}." .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) self.grad_buffer = self._declare_grad_buffer() if self.enable_tidy_profiling: self.global_rank = args.rank self.init_event = None self.init_time_stamp = None assert self.flatten self.sync_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) self.optimizer_step_start_event = torch.cuda.Event(enable_timing=True, blocking=False) def _compute_total_para_num(self): total_count = 0 element_size = 0 for para in self.module.parameters(): # print("Parameter: ", para.data.shape) total_count += torch.numel(para.data) element_size = para.element_size() return total_count, element_size def _declare_grad_buffer(self): assert self.flatten_para.data.numel() % self.dp_group_size == 0 chunk_size = self.flatten_para.data.numel() // self.dp_group_size grad_buffer = [torch.zeros(chunk_size, device=self.flatten_para.device, dtype=self.flatten_para.dtype) for _ in range(self.dp_group_size)] return grad_buffer def profile_mark_sync_grad_start(self): if self.enable_tidy_profiling: self.dp_comm_stream.record_event(self.sync_gradients_start_event) def profile_mark_allreduce_end(self): pass def profile_mark_optimizer_step_start(self): if self.enable_tidy_profiling: self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) def _sync_gradients(self): with torch.cuda.stream(self.dp_comm_stream): cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) self.dp_comm_stream.wait_event(self.backward_ready_event) assert self.flatten self.profile_mark_sync_grad_start() self.dp_comm.all_reduce_opt(self.flatten_para.grad, self.grad_buffer, stream=cupy_dp_stream) self.profile_mark_allreduce_end() self.dp_comm_stream.record_event(self.sync_gradients_ready_event) def optimizer_step(self): self._sync_gradients() with torch.cuda.stream(self.torch_optim_comp_stream): self.torch_optim_comp_stream.wait_event(self.sync_gradients_ready_event) self.profile_mark_optimizer_step_start() self.optimizer.step() self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) def set_time_stamp(self, init_time_stamp, init_event): self.init_event = init_event self.init_time_stamp = init_time_stamp def get_ts(self, event): return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 def profiling_data_parallel(self, init_time_stamp, init_event): self.set_time_stamp(init_time_stamp, init_event) profiling_log = [] assert self.flatten allreduce_slot = self.sync_gradients_start_event.elapsed_time(self.sync_gradients_ready_event)*1e+3 allreduce_log = {"name": "opt_shardedPS_sync", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", "ts": self.get_ts(self.sync_gradients_start_event), "dur": allreduce_slot, "cname": "cq_build_passed", "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} # print(allreduce_log) profiling_log.append(allreduce_log) optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} # print(optimizer_log) profiling_log.append(optimizer_log) return profiling_log ================================================ FILE: training/data_parallel/dist_dp_utils.py ================================================ from .dist_dp_allreduce import AllReduceDP from .dist_dp_sharded_ps import ShardedPSDP from .dist_dp_local import LocalDP def get_dp_module(args, device, module, optimizer): print("Data parallel implementation: ", args.dp_mode) if args.dp_mode == 'allreduce': return AllReduceDP(args, device, module, optimizer, flatten=False) # flatten gradient is not compatible with fp16 now elif args.dp_mode == 'local': return LocalDP(args, device, module, optimizer, flatten=False) elif args.dp_mode == 'sharded_ps': return ShardedPSDP(args, device, module, optimizer, flatten=False) else: print("Not recognize this data parallel mode.") assert False ================================================ FILE: training/data_parallel/flatten_utils.py ================================================ import torch def _assert_contiguous(tensors): data_ptr = None for t in tensors: if data_ptr is not None: assert t.data_ptr() == data_ptr data_ptr = t.data_ptr() + t.numel() * t.element_size() def flatten_params(param_set, chunk=None): params = [p for p in param_set] weights = [p.data for p in params] grads = [p.grad.data if p.grad is not None else torch.zeros_like(p.data) for p in params] sizes = [p.numel() for p in params] total_size = sum(sizes) if chunk: total_size = ((total_size+chunk-1)//chunk)*chunk flatten_weights_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) flatten_grads_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) flatten_weights_storage = flatten_weights_tensor.storage() flatten_grads_storage = flatten_grads_tensor.storage() def set_storage(param, weight_storage, grad_storage, storage_offset): with torch.no_grad(): z = torch.zeros_like(param.data) z.set_(weight_storage, storage_offset, param.shape) param.data = z t = torch.zeros_like(param.data) t.set_(grad_storage, storage_offset, param.shape) param.grad = t offset = 0 for i in range(len(params)): flatten_weights_tensor[offset: offset + weights[i].numel()] = weights[i].reshape(-1) flatten_grads_tensor[offset: offset + grads[i].numel()] = grads[i].reshape(-1) set_storage(params[i], flatten_weights_storage, flatten_grads_storage, offset) offset += sizes[i] weight_tensors = [p.data for p in params] grad_tensors = [p.grad.data for p in params] _assert_contiguous(weight_tensors) _assert_contiguous(grad_tensors) with torch.no_grad(): flatten_para = torch.nn.Parameter(flatten_weights_tensor, requires_grad=False) flatten_para.grad = flatten_grads_tensor return flatten_para def flatten_tensors(tensor_set, chunk=None): tensors = [p for p in tensor_set] weights = [p.data for p in tensors] sizes = [p.numel() for p in tensors] total_size = sum(sizes) if chunk: total_size = ((total_size+chunk-1)//chunk)*chunk flatten_weights_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) flatten_weights_storage = flatten_weights_tensor.storage() def set_storage(param, weight_storage, storage_offset): with torch.no_grad(): z = torch.zeros_like(param.data) z.set_(weight_storage, storage_offset, param.shape) param.data = z offset = 0 for i in range(len(tensors)): flatten_weights_tensor[offset: offset + weights[i].numel()] = weights[i].reshape(-1) set_storage(tensors[i], flatten_weights_storage, offset) offset += sizes[i] return flatten_weights_tensor ================================================ FILE: training/dist_clm_train.py ================================================ import argparse import time import random import numpy as np import torch import torch.autograd.profiler as profiler from tasks.data_loaders.data_utils import get_train_data_loader, get_eval_data_loader from modules.utils import gpt_loss_func from modules.tokenizer import build_tokenizer from pipeline_parallel.dist_pp_utils import get_pp_module from transformers import AutoConfig import datasets from utils.dist_args_utils import * from utils.dist_checkpoint_utils import * from utils.logging_utils import * from utils.event_report import * from comm.comm_utils import * from utils.upload_manager import * def test_loop(args, pipe, device, test_data_loader): if test_data_loader is None: return print('testing starts.....') pipe.model.eval() if get_pipeline_parallel_rank() == args.pipeline_group_size - 1: def _lm_pred_func(x, y): loss_fct = torch.nn.CrossEntropyLoss(reduction='none') logits = x[:, :-1, :].contiguous().float() labels = y[:, 1:].contiguous() loss = loss_fct(logits.transpose(-1, -2), labels).mean(1).detach().cpu() return loss loss_list = [] for i, data in enumerate(test_data_loader): if args.evaluation_num_batch is not None and i >= args.evaluation_num_batch: break input_ids = data['input_ids'].to(device) labels = input_ids.clone() pipe.infer_iter(input_ids, labels, output_=loss_list, pred_func=_lm_pred_func) loss = torch.tensor(loss_list).mean() ppls = torch.exp(loss) metric = {"valid.perplexity": ppls.item(), "valid.loss": loss.item()} print(metric) train_log( metric, step=pipe.global_step, ) else: for i, data in enumerate(test_data_loader): if args.evaluation_num_batch is not None and i >= args.evaluation_num_batch: break input_ids = data['input_ids'].to(device) labels = input_ids.clone() current_iter_time = pipe.infer_iter(input_ids, labels) pipe.model.train() def train_loop(args, pipe, device, train_data_loader, test_data_loader, steps_per_epoch): print('training starts......') event_reporter = EventReporter(host=args.event_host, auth_token=args.event_auth_token, job_id=args.job_id) pipe.model.train() # Flag .training to True to enable Dropout use_dp = (args.world_size != args.pipeline_group_size) if use_dp: # dp_comm = get_data_parallel_comm() dp_rank = get_data_parallel_rank() dp_size = get_data_parallel_world_size() else: dp_rank = 0 dp_size = 1 pp_comm = get_pipeline_parallel_comm() stop_flag = torch.zeros(1, dtype=torch.int64).to(device) input_ids = torch.zeros( [args.batch_size, args.seq_length], dtype=torch.int64 ).to(device) do_sync_before_save = (args.dp_mode in ['local'] and use_dp) # Get the number of model parameters for the model param_count = torch.zeros(1, dtype=torch.int64).to(device) local_param_count = sum(p.numel() for p in pipe.model.parameters()) param_count.data[:] = local_param_count pp_comm.reduce(param_count, 0) if get_pipeline_parallel_rank() == 0 and dp_rank == 0: print(f"Training steps: total_steps={args.total_steps}, steps_per_epoch={steps_per_epoch}, steps_per_checkpoint={args.checkpoint_steps}") upload_checkpoints_enabled = args.checkpoint_upload_prefix is not None upload_manager = UploadManager(aws_endpoint_url = args.aws_endpoint_url, aws_access_key_id = args.aws_access_key_id, aws_secret_access_key = args.aws_secret_access_key, aws_session_token = args.aws_session_token, aws_region = args.aws_region, event_reporter = event_reporter, n_stages = args.pipeline_group_size) if event_reporter is not None: # Get the number of tokens in the dataset token_count = train_data_loader.dataset.get_dataset_token_count() # Report training start event_reporter.report(object=EventReporter.OBJECT_FINE_TUNE, message=f"Training started for model {args.model_name}", event_type=EventReporter.EVENT_TYPE_TRAINING_START, param_count=param_count.item(), token_count=token_count, requires_is_enabled=False) for i, data in enumerate(train_data_loader): # if i < pipe.global_step: # continue if use_dp: get_data_parallel_comm().broadcast(stop_flag, 0) pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break input_ids_global = data['input_ids'].to(torch.int64).to(device) input_ids_list = input_ids_global.chunk(dp_size) if use_dp: for j in range(1, dp_size): get_data_parallel_comm().send( input_ids_list[j], j, ) input_ids = input_ids_list[0] pp_comm.broadcast(input_ids, 0) labels = input_ids.clone() current_iter_time = pipe.sgd_iter(input_ids, labels, loss_func=gpt_loss_func) if event_reporter is not None and (pipe.global_step >= args.total_steps or pipe.global_step % steps_per_epoch == 0): event_reporter.report(object=EventReporter.OBJECT_FINE_TUNE, message=f"Epoch completed, at step {pipe.global_step}", event_type=EventReporter.EVENT_TYPE_EPOCH_COMPLETE, requires_is_enabled=False) if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step >= args.total_steps or pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: checkpoint_step_path = save_checkpoint(pipe, args) if upload_checkpoints_enabled: upload_manager.add_task(directory=checkpoint_step_path, checkpoint_upload_prefix=args.checkpoint_upload_prefix, step=pipe.global_step) if do_sync_before_save: pipe.dp_optim.rollback_parameters() if pipe.global_step >= args.total_steps: stop_flag.data[:] = 1 if upload_checkpoints_enabled: upload_manager.wait() elif get_pipeline_parallel_rank() == 0: while True: get_data_parallel_comm().broadcast(stop_flag, 0) pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break get_data_parallel_comm().recv( input_ids, 0, ) pp_comm.broadcast(input_ids, 0) labels = input_ids.clone() current_iter_time = pipe.sgd_iter(input_ids, labels, loss_func=gpt_loss_func) if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step >= args.total_steps or pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: save_checkpoint(pipe, args) if do_sync_before_save: pipe.dp_optim.rollback_parameters() elif get_pipeline_parallel_rank() == args.pipeline_group_size - 1: while True: pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break pp_comm.broadcast(input_ids, 0) labels = input_ids.clone() current_iter_time = pipe.sgd_iter(input_ids, labels, loss_func=gpt_loss_func) # lm loss func if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step >= args.total_steps or pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: save_checkpoint(pipe, args) pipe.save_on_disk(args.checkpoint_path) if do_sync_before_save: pipe.dp_optim.rollback_parameters() else: while True: pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break pp_comm.broadcast(input_ids, 0) current_iter_time = pipe.sgd_iter(None, None) if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step >= args.total_steps or pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: save_checkpoint(pipe, args) if do_sync_before_save: pipe.dp_optim.rollback_parameters() # Compute the total number of training steps, steps per epoch, and steps per # checkpoint def calculate_training_steps(args, train_data_loader) -> int: total_steps = 0 steps_per_epoch = 0 steps_per_checkpoint = 0 token_count = train_data_loader.dataset.get_dataset_token_count() # Check the inputs to calculate the total steps if args.batch_size is None or args.world_size is None or args.pipeline_group_size is None or token_count is None or args.seq_length is None: print("Missing required arguments for calculating total steps based on epochs.") sys.exit(1) global_batch_size = (args.batch_size * args.world_size + args.pipeline_group_size - 1) // args.pipeline_group_size tokens_per_batch = global_batch_size * args.seq_length steps_per_epoch = (token_count + tokens_per_batch - 1) // tokens_per_batch if args.total_steps is not None: if args.nepochs is not None: print("WARNING: total_steps ({args.toal_steps}) supercedes nepochs ({args.nepochs}).") total_steps = args.total_steps elif args.nepochs is not None: total_steps = steps_per_epoch * args.nepochs else: total_steps = len(train_data_loader) # Set the minimum number of total steps if total_steps < 10: total_steps = 10 # Ensure that the steps per epoch are consistent with total steps # Note: This does not strictly follow the definition of an epoch. It just # approximately distributes the reporting of epochs over the total number of # steps. if args.nepochs is not None: steps_per_epoch = (total_steps + args.nepochs - 1) // args.nepochs # clamp steps_per_epoch to [1, total_steps] if steps_per_epoch > total_steps: steps_per_epoch = total_steps if steps_per_epoch < 1: steps_per_epoch = 1 # Set the number of steps per epoch based on user input. if args.checkpoint_steps is not None and args.checkpoint_steps > 0: steps_per_checkpoint = args.checkpoint_steps elif args.num_checkpoints is not None and args.num_checkpoints > 0: steps_per_checkpoint = (total_steps + args.num_checkpoints - 1) // args.num_checkpoints else: steps_per_checkpoint = total_steps # Clamp steps_per_checkpoint to [1, total_steps] if steps_per_checkpoint > total_steps: steps_per_checkpoint = total_steps if steps_per_checkpoint < 1: steps_per_checkpoint = 1 # Set the args base on what we computed above args.total_steps = total_steps args.checkpoint_steps = steps_per_checkpoint return steps_per_epoch def main(): parser = argparse.ArgumentParser(description='Gpipe-GPT') add_device_arguments(parser) add_torch_distributed_arguments(parser) add_model_arguments(parser) add_task_arguments(parser) add_training_hyper_parameter_arguments(parser) add_mixed_precision_arguments(parser) add_parallel_schema_arguments(parser) add_entry_reporter_arguments(parser) parser.add_argument('--model-name', type=str, default='gpt2', metavar='S', help='model name or path') parser.add_argument('--tokenizer-name', type=str, default='gpt2', metavar='S', help='tokenizer name or path') parser.add_argument('--model-type', type=str, default='gpt2', metavar='S', help='model name or path') parser.add_argument('--checkpoint-path', type=str, default='model_checkpoints/gpt2') parser.add_argument('--task-name', type=str, default='cot', metavar='S', help='task name') parser.add_argument('--warmup-steps', type=int, default=0, help='-') parser.add_argument('--train-warmup-steps', type=int, default=0, help='-') parser.add_argument('--nepochs', type=int, default=None, help='-') parser.add_argument('--total-steps', type=int, default=None, help='-') parser.add_argument('--load-pretrained-model', type=lambda x: x.lower()=='true', default=True, metavar='S', help='load pretrained model or not.') parser.add_argument('--load-checkpoint', type=lambda x: x.lower()=='true', default=True, metavar='S', help='load pretrained model or not.') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--profiling', type=str, default='no-profiling', metavar='S', help='enable which profiling? default: tidy mode') parser.add_argument('--trace-postfix', type=str, default='default', metavar='S', help='postfix of the tracing file name.') parser.add_argument('--evaluation-steps', type=int, default=0, metavar='S', help='every x steps, do evaluation. (0 means do not do evaluation)') parser.add_argument('--evaluation-data', type=str, default=None, help="path of eval data in jsonl") parser.add_argument('--evaluation-num-batch', type=int, default=None, help="for debug purpose, only eval the first several batch.") parser.add_argument('--checkpoint-steps', type=int, default=0, metavar='S', help='every x steps, save checkpoint. (0 means do not save checkpoint)') parser.add_argument('--num-checkpoints', type=int, default=0, metavar='S', help='number of checkpoints to save') parser.add_argument('--net-interface', type=str, default='lo', metavar='S', help='net_interface') parser.add_argument('--job-id', type=str, default="0", metavar='S', help='an uuid') # Add AWS arguments for uploading checkpoints to S3 parser.add_argument('--checkpoint-upload-prefix', default=None, help='S3 bucket name') add_aws_arguments(parser) args = parser.parse_args() aws_process_args(args) torch.manual_seed(args.seed) random.seed(args.seed) np.random.seed(args.seed) if args.use_cuda: assert (torch.cuda.is_available()) device = torch.device('cuda', args.cuda_id) else: device = torch.device('cpu') init_communicators(args) use_dp = (args.world_size != args.pipeline_group_size) if use_dp: dp_comm = get_data_parallel_comm() dp_rank = get_data_parallel_rank() dp_size = get_data_parallel_world_size() else: dp_rank = 0 dp_size = 1 config = AutoConfig.from_pretrained(args.model_name) # num layer globally if hasattr(config, 'num_hidden_layers'): args.max_layers = config.num_hidden_layers elif hasattr(config, 'num_layers'): args.max_layers = config.num_layers else: args.max_layers = config.n_layer tokenizer = build_tokenizer(args) tokenizer.model_max_length = args.seq_length config.max_position_embeddings = args.seq_length # config.vocab_size = tokenizer.vocab_size config.bos_token_id = tokenizer.bos_token_id config.eos_token_id = tokenizer.eos_token_id config.pad_token_id = tokenizer.pad_token_id print("token vocab size:", config.vocab_size) train_data_loader = get_train_data_loader(args, tokenizer) if args.evaluation_data is not None and dp_rank == 0: test_data_loader = get_eval_data_loader(args, tokenizer) else: test_data_loader = None # calculate total steps steps_per_epoch = calculate_training_steps(args, train_data_loader) use_dp = (args.world_size != args.pipeline_group_size) if use_dp: print("Running ", args.pp_mode, " with data parallel.") else: print("Running ", args.pp_mode, " without data parallel.") pipe = get_pp_module(args, config, device, use_dp) if args.load_checkpoint: load_checkpoint(pipe, args) if args.fp16: pipe.optimizer.reload_model_params() if args.profiling == 'no-profiling': train_loop(args, pipe, device, train_data_loader, test_data_loader, steps_per_epoch) else: prefix = './trace_json/gpt3_' + args.pp_mode if use_dp: prefix = prefix + '_' + args.dp_mode trace_file = prefix + get_learning_arguments_str(args) + get_model_arguments_str(args) + \ get_dist_arguments_str(args) + get_mixed_precision_arguments_str(args) + '_' + \ args.profiling + '_' + args.trace_postfix + '.json' if args.profiling == 'tidy_profiling': try: train_loop(args, pipe, device, train_data_loader, test_data_loader, steps_per_epoch) except Exception as e: raise e print(get_pipeline_parallel_rank(), e) pipe.export_profiling_result(filename=trace_file) elif args.profiling == 'pytorch_profiling': with profiler.profile(profile_memory=True, use_cuda=args.use_cuda) as prof: train_loop(args, pipe, device, train_data_loader, test_data_loader, steps_per_epoch) print(prof.key_averages().table()) prof.export_chrome_trace(trace_file) else: print("No recognized profiler?") assert False print(get_pipeline_parallel_rank(), 'finished.') if __name__ == '__main__': main() ================================================ FILE: training/dist_prefixlm_train.py ================================================ import argparse import time import random import numpy as np import torch import torch.autograd.profiler as profiler from tasks.data_loaders.data_utils import get_ul2r_train_data_loader from modules.utils import gpt_loss_func from modules.tokenizer import build_tokenizer from pipeline_parallel.dist_pp_utils import get_pp_module from transformers import AutoConfig import datasets from utils.dist_args_utils import * from utils.dist_checkpoint_utils import * from utils.logging_utils import * from comm.comm_utils import * def test_loop(args, pipe, device, test_data_loader): print("no impl for testing, skip.") def train_loop(args, pipe, device, train_data_loader, test_data_loader): print('training starts......') pipe.model.train() # Flag .training to True to enable Dropout use_dp = (args.world_size != args.pipeline_group_size) if use_dp: # dp_comm = get_data_parallel_comm() dp_rank = get_data_parallel_rank() dp_size = get_data_parallel_world_size() else: dp_rank = 0 dp_size = 1 pp_comm = get_pipeline_parallel_comm() stop_flag = torch.zeros(1, dtype=torch.int64).to(device) input_ids = torch.zeros( [args.batch_size, args.seq_length], dtype=torch.int64 ).to(device) prefix_masks = torch.zeros( [args.batch_size, args.seq_length], dtype=torch.uint8 ).to(device) do_sync_before_save = (args.dp_mode in ['local'] and use_dp) if get_pipeline_parallel_rank() == 0 and dp_rank == 0: for i, data in enumerate(train_data_loader): if i < pipe.global_step: continue if use_dp: get_data_parallel_comm().broadcast(stop_flag, 0) pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break input_ids_global = data['input_ids'].to(torch.int64).to(device) prefix_masks_global = data['prefix_masks'].to(torch.uint8).to(device) input_ids_list = input_ids_global.chunk(dp_size) prefix_masks_list = prefix_masks_global.chunk(dp_size) if use_dp: for j in range(1, dp_size): get_data_parallel_comm().send( input_ids_list[j], j, ) get_data_parallel_comm().send( prefix_masks_list[j], j, ) input_ids = input_ids_list[0] prefix_masks = prefix_masks_list[0] pp_comm.broadcast(input_ids, 0) pp_comm.broadcast(prefix_masks, 0) labels = input_ids.clone() current_iter_time = pipe.sgd_iter( input_ids, labels, aux_input_data={'prefix_masks': prefix_masks}, loss_func=gpt_loss_func ) if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: save_checkpoint(pipe, args) if do_sync_before_save: pipe.dp_optim.rollback_parameters() if pipe.global_step >= args.total_steps: stop_flag.data[:] = 1 elif get_pipeline_parallel_rank() == 0: while True: get_data_parallel_comm().broadcast(stop_flag, 0) pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break get_data_parallel_comm().recv( input_ids, 0, ) get_data_parallel_comm().recv( prefix_masks, 0, ) pp_comm.broadcast(input_ids, 0) pp_comm.broadcast(prefix_masks, 0) labels = input_ids.clone() current_iter_time = pipe.sgd_iter( input_ids, labels, aux_input_data={'prefix_masks': prefix_masks}, loss_func=gpt_loss_func) if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: save_checkpoint(pipe, args) if do_sync_before_save: pipe.dp_optim.rollback_parameters() elif get_pipeline_parallel_rank() == args.pipeline_group_size - 1: while True: pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break pp_comm.broadcast(input_ids, 0) pp_comm.broadcast(prefix_masks, 0) labels = input_ids.clone() labels[prefix_masks.bool()] = -100 # mask prefix part current_iter_time = pipe.sgd_iter( input_ids, labels, loss_func=gpt_loss_func, aux_input_data={'prefix_masks': prefix_masks} ) if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: save_checkpoint(pipe, args) pipe.save_on_disk(args.checkpoint_path) if do_sync_before_save: pipe.dp_optim.rollback_parameters() else: while True: pp_comm.broadcast(stop_flag, 0) if stop_flag.item() == 1: break pp_comm.broadcast(input_ids, 0) pp_comm.broadcast(prefix_masks, 0) current_iter_time = pipe.sgd_iter(None, None, aux_input_data={'prefix_masks': prefix_masks}) if args.evaluation_steps > 0 and pipe.global_step % args.evaluation_steps == 0: test_loop(args, pipe, device, test_data_loader) if pipe.global_step % args.checkpoint_steps == 0: if do_sync_before_save: pipe.dp_optim.allreduce_parameters() if dp_rank == 0: save_checkpoint(pipe, args) if do_sync_before_save: pipe.dp_optim.rollback_parameters() def main(): parser = argparse.ArgumentParser(description='Gpipe-GPT') add_device_arguments(parser) add_torch_distributed_arguments(parser) add_model_arguments(parser) add_task_arguments(parser) add_training_hyper_parameter_arguments(parser) add_mixed_precision_arguments(parser) add_parallel_schema_arguments(parser) parser.add_argument('--model-name', type=str, default='gpt2', metavar='S', help='model name or path') parser.add_argument('--tokenizer-name', type=str, default='gpt2', metavar='S', help='tokenizer name or path') parser.add_argument('--model-type', type=str, default='gpt2', metavar='S', help='model name or path') parser.add_argument('--checkpoint-path', type=str, default='model_checkpoints/gpt2') parser.add_argument('--task-name', type=str, default='cot', metavar='S', help='task name') parser.add_argument('--warmup-steps', type=int, default=0, help='-') parser.add_argument('--train-warmup-steps', type=int, default=0, help='-') parser.add_argument('--total-steps', type=int, default=None, help='-') parser.add_argument('--load-pretrained-model', type=lambda x: x.lower()=='true', default=True, metavar='S', help='load pretrained model or not.') parser.add_argument('--load-checkpoint', type=lambda x: x.lower()=='true', default=True, metavar='S', help='load pretrained model or not.') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--profiling', type=str, default='no-profiling', metavar='S', help='enable which profiling? default: tidy mode') parser.add_argument('--trace-postfix', type=str, default='default', metavar='S', help='postfix of the tracing file name.') parser.add_argument('--evaluation-steps', type=int, default=0, metavar='S', help='every x steps, do evaluation. (0 means do not do evaluation)') parser.add_argument('--evaluation-data', type=str, default=None, help="path of eval data in jsonl") parser.add_argument('--evaluation-num-batch', type=int, default=None, help="for debug purpose, only eval the first several batch.") parser.add_argument('--checkpoint-steps', type=int, default=0, metavar='S', help='every x steps, save checkpoint. (0 means do not save checkpoint)') parser.add_argument('--net-interface', type=str, default='lo', metavar='S', help='net_interface') parser.add_argument('--job-id', type=str, default="0", metavar='S', help='an uuid') args = parser.parse_args() torch.manual_seed(args.seed) random.seed(args.seed) np.random.seed(args.seed) if args.use_cuda: assert (torch.cuda.is_available()) device = torch.device('cuda', args.cuda_id) else: device = torch.device('cpu') init_communicators(args) use_dp = (args.world_size != args.pipeline_group_size) if use_dp: dp_comm = get_data_parallel_comm() dp_rank = get_data_parallel_rank() dp_size = get_data_parallel_world_size() else: dp_rank = 0 dp_size = 1 config = AutoConfig.from_pretrained(args.model_name) # num layer globally if hasattr(config, 'num_hidden_layers'): args.max_layers = config.num_hidden_layers elif hasattr(config, 'num_layers'): args.max_layers = config.num_layers else: args.max_layers = config.n_layer tokenizer = build_tokenizer(args) tokenizer.model_max_length = args.seq_length # config.vocab_size = tokenizer.vocab_size config.bos_token_id = tokenizer.bos_token_id config.eos_token_id = tokenizer.eos_token_id config.pad_token_id = tokenizer.pad_token_id print("token vocab size:", config.vocab_size) if get_pipeline_parallel_rank() == 0 and dp_rank == 0: train_data_loader = get_ul2r_train_data_loader(args, tokenizer) else: train_data_loader = None test_data_loader = None if args.total_steps is None: args.total_steps = len(train_data_loader) use_dp = (args.world_size != args.pipeline_group_size) if use_dp: print("Running ", args.pp_mode, " with data parallel.") else: print("Running ", args.pp_mode, " without data parallel.") pipe = get_pp_module(args, config, device, use_dp) if args.load_checkpoint: load_checkpoint(pipe, args) if args.fp16: pipe.optimizer.reload_model_params() if args.model_type == 'gptj': # make sure, causal mask is here. max_positions = config.n_positions for module in pipe.model.model: if hasattr(module, 'attn'): print('put back causal mask') module.attn.bias[:] = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( 1, 1, max_positions, max_positions ) if args.profiling == 'no-profiling': train_loop(args, pipe, device, train_data_loader, test_data_loader) else: prefix = './trace_json/gpt3_' + args.pp_mode if use_dp: prefix = prefix + '_' + args.dp_mode trace_file = prefix + get_learning_arguments_str(args) + get_model_arguments_str(args) + \ get_dist_arguments_str(args) + get_mixed_precision_arguments_str(args) + '_' + \ args.profiling + '_' + args.trace_postfix + '.json' if args.profiling == 'tidy_profiling': try: train_loop(args, pipe, device, train_data_loader, test_data_loader) except Exception as e: raise e print(get_pipeline_parallel_rank(), e) pipe.export_profiling_result(filename=trace_file) elif args.profiling == 'pytorch_profiling': with profiler.profile(profile_memory=True, use_cuda=args.use_cuda) as prof: train_loop(args, pipe, device, train_data_loader, test_data_loader) print(prof.key_averages().table()) prof.export_chrome_trace(trace_file) else: print("No recognized profiler?") assert False print(get_pipeline_parallel_rank(), 'finished.') if __name__ == '__main__': main() ================================================ FILE: training/finetune_GPT-NeoXT-Chat-Base-20B.sh ================================================ DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) netif=lo export GLOO_SOCKET_IFNAME=${netif} export NCCL_SOCKET_IFNAME=${netif} export MODEL_NAME=GPT-Neo-XT-Chat-Base-20B export SHOW_DATA=0 BASE_MODEL="${DIR}/../pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b/" TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-20000} CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-100} CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} DATASETS="\ ${DIR}/../data/OIG/files/unified_ni.jsonl:0.2,\ ${DIR}/../data/OIG/files/unified_p3.jsonl:0.5,\ ${DIR}/../data/OIG/files/unified_flan.jsonl:0.2,\ ${DIR}/../data/OIG/files/unified_chip2.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_rallio_safety_and_prosocial.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_soda_dialog.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_unifiedskg_instructions.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_merged_code_xp3.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_oscar_en_sample_dialog.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_ul2_plus_oscar_en_sample_dialog.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_multi_news.jsonl:0.05,\ ${DIR}/../data/OIG/files/unified_openai_summarize_tldr.jsonl:0.05,\ ${DIR}/../data/OIG/files/unified_squad_v2.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_nq.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_poetry_instructions.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_sqlv2.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_unnatural_instructions.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_conv_finqa.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_essays.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_plot_screenplay_books_dialog.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_grade_school_math_instructions.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_mathqa_flanv2_kojma_cot.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_joke_explanations.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_cuad.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_abstract_infill.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_image_prompts_instructions.jsonl:0.01 \ " ARGS="--model-name ${BASE_MODEL} \ --tokenizer-name ${BASE_MODEL} \ --project-name together \ --model-type gptneox \ --optimizer adam \ --seed 42 \ --load-pretrained-model true \ --task-name \ "${DATASETS}" \ --checkpoint-path ${CHECKPOINT_PATH} \ --total-steps ${TOTAL_STEPS} --warmup-steps 10 --train-warmup-steps 0 \ --checkpoint-steps ${CHECKPOINT_STEPS} \ --lr 1e-6 --seq-length 2048 --batch-size 64 --micro-batch-size 1 --gradient-accumulate-step 1 \ --dist-url tcp://127.0.0.1:7033 \ --num-layers 6 --embedding-dim 6144 \ --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ --job-id 0 --net-interface ${netif} \ --fp16 \ --dp-backend nccl \ --dp-mode allreduce \ --pp-mode gpipe --profiling no-profiling" (trap 'kill 0' SIGINT; \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ & \ wait) ================================================ FILE: training/finetune_Pythia-Chat-Base-7B.sh ================================================ DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) netif=lo export GLOO_SOCKET_IFNAME=${netif} export NCCL_SOCKET_IFNAME=${netif} export MODEL_NAME=Pythia-Chat-Base-7B export SHOW_DATA=0 BASE_MODEL="${DIR}/../pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped/" TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-20000} CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-100} CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} DATASETS="\ ${DIR}/../data/OIG/files/unified_ni.jsonl:0.2,\ ${DIR}/../data/OIG/files/unified_p3.jsonl:0.5,\ ${DIR}/../data/OIG/files/unified_flan.jsonl:0.2,\ ${DIR}/../data/OIG/files/unified_chip2.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_rallio_safety_and_prosocial.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_soda_dialog.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_unifiedskg_instructions.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_merged_code_xp3.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_oscar_en_sample_dialog.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_ul2_plus_oscar_en_sample_dialog.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_multi_news.jsonl:0.05,\ ${DIR}/../data/OIG/files/unified_openai_summarize_tldr.jsonl:0.05,\ ${DIR}/../data/OIG/files/unified_squad_v2.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_nq.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_poetry_instructions.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_sqlv2.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_unnatural_instructions.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_conv_finqa.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_essays.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_plot_screenplay_books_dialog.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_grade_school_math_instructions.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_mathqa_flanv2_kojma_cot.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_joke_explanations.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_cuad.jsonl:0.01,\ ${DIR}/../data/OIG/files/unified_abstract_infill.jsonl:0.1,\ ${DIR}/../data/OIG/files/unified_image_prompts_instructions.jsonl:0.01 \ " ARGS="--model-name ${BASE_MODEL} \ --tokenizer-name ${BASE_MODEL} \ --project-name together \ --model-type gptneox \ --optimizer adam \ --seed 42 \ --load-pretrained-model true \ --task-name \ "${DATASETS}" \ --checkpoint-path ${CHECKPOINT_PATH} \ --total-steps ${TOTAL_STEPS} --warmup-steps 10 --train-warmup-steps 0 \ --checkpoint-steps ${CHECKPOINT_STEPS} \ --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ --dist-url tcp://127.0.0.1:7033 \ --num-layers 8 --embedding-dim 4096 \ --world-size 8 --pipeline-group-size 4 --data-group-size 2 \ --job-id 0 --net-interface ${netif} \ --fp16 \ --dp-backend nccl \ --dp-mode allreduce \ --pp-mode gpipe --profiling no-profiling" (trap 'kill 0' SIGINT; \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ & \ wait) ================================================ FILE: training/finetune_RedPajama-INCITE-7B-Chat.sh ================================================ DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) netif=lo export GLOO_SOCKET_IFNAME=${netif} export NCCL_SOCKET_IFNAME=${netif} export MODEL_NAME=redpajama-incite-chat-3b-sample export SHOW_DATA=0 BASE_MODEL="${DIR}/../pretrained/RedPajama-7B/togethercomputer_RedPajama-INCITE-7B-Chat" TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} DATASETS="${DIR}/../data/OIG-chip2/unified_chip2.jsonl:1" ARGS="--model-name ${BASE_MODEL} \ --tokenizer-name ${BASE_MODEL} \ --project-name together \ --model-type gptneox \ --optimizer adam \ --seed 42 \ --load-pretrained-model true \ --task-name \ "${DATASETS}" \ --checkpoint-path ${CHECKPOINT_PATH} \ --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ --checkpoint-steps ${CHECKPOINT_STEPS} \ --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ --dist-url tcp://127.0.0.1:7033 \ --num-layers 4 --embedding-dim 2560 \ --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ --job-id 0 --net-interface ${netif} \ --fp16 \ --dp-backend nccl \ --dp-mode allreduce \ --pp-mode gpipe --profiling no-profiling" (trap 'kill 0' SIGINT; \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ & \ wait) ================================================ FILE: training/finetune_RedPajama-INCITE-Chat-3B-v1.sh ================================================ DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) netif=lo export GLOO_SOCKET_IFNAME=${netif} export NCCL_SOCKET_IFNAME=${netif} export MODEL_NAME=redpajama-incite-chat-3b-sample export SHOW_DATA=0 BASE_MODEL="${DIR}/../pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1" TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} DATASETS="${DIR}/../data/OIG-chip2/unified_chip2.jsonl:1" ARGS="--model-name ${BASE_MODEL} \ --tokenizer-name ${BASE_MODEL} \ --project-name together \ --model-type gptneox \ --optimizer adam \ --seed 42 \ --load-pretrained-model true \ --task-name \ "${DATASETS}" \ --checkpoint-path ${CHECKPOINT_PATH} \ --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ --checkpoint-steps ${CHECKPOINT_STEPS} \ --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ --dist-url tcp://127.0.0.1:7033 \ --num-layers 4 --embedding-dim 2560 \ --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ --job-id 0 --net-interface ${netif} \ --fp16 \ --dp-backend nccl \ --dp-mode allreduce \ --pp-mode gpipe --profiling no-profiling" (trap 'kill 0' SIGINT; \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ & \ wait) ================================================ FILE: training/finetune_llama-2-7b-32k-booksum.sh ================================================ DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) netif=lo export GLOO_SOCKET_IFNAME=${netif} export NCCL_SOCKET_IFNAME=${netif} export MODEL_NAME=llama-2-7b-32k-booksum export SHOW_DATA=1 BASE_MODEL="${DIR}/../pretrained/Llama-2-7B-32K-beta/togethercomputer_Llama-2-7B-32K-beta" TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} DATASETS="https://huggingface.co/datasets/togethercomputer/Long-Data-Collections/resolve/main/fine-tune/booksum.jsonl.zst:1" ARGS="--model-name ${BASE_MODEL} \ --tokenizer-name ${BASE_MODEL} \ --project-name together \ --model-type llama \ --optimizer adam \ --seed 42 \ --load-pretrained-model true \ --task-name \ "${DATASETS}" \ --checkpoint-path ${CHECKPOINT_PATH} \ --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ --checkpoint-steps ${CHECKPOINT_STEPS} \ --lr 2e-5 --seq-length 32768 --batch-size 4 --micro-batch-size 1 --gradient-accumulate-step 1 \ --dist-url tcp://127.0.0.1:7033 \ --num-layers 4 --embedding-dim 4096 \ --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ --job-id 0 --net-interface ${netif} \ --fp16 \ --dp-backend nccl \ --dp-mode allreduce \ --pp-mode gpipe --profiling no-profiling" (trap 'kill 0' SIGINT; \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ & \ wait) ================================================ FILE: training/finetune_llama-2-7b-32k-mqa.sh ================================================ DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) netif=lo export GLOO_SOCKET_IFNAME=${netif} export NCCL_SOCKET_IFNAME=${netif} export MODEL_NAME=llama-2-7b-32k-mqa export SHOW_DATA=1 BASE_MODEL="${DIR}/../pretrained/Llama-2-7B-32K-beta/togethercomputer_Llama-2-7B-32K-beta" TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} DATASETS="https://huggingface.co/datasets/togethercomputer/Long-Data-Collections/resolve/main/fine-tune/natural_questions_10_200_docs.jsonl.zst:1" ARGS="--model-name ${BASE_MODEL} \ --tokenizer-name ${BASE_MODEL} \ --project-name together \ --model-type llama \ --optimizer adam \ --seed 42 \ --load-pretrained-model true \ --task-name \ "${DATASETS}" \ --checkpoint-path ${CHECKPOINT_PATH} \ --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ --checkpoint-steps ${CHECKPOINT_STEPS} \ --lr 2e-5 --seq-length 32768 --batch-size 4 --micro-batch-size 1 --gradient-accumulate-step 1 \ --dist-url tcp://127.0.0.1:7033 \ --num-layers 4 --embedding-dim 4096 \ --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ --job-id 0 --net-interface ${netif} \ --fp16 \ --dp-backend nccl \ --dp-mode allreduce \ --pp-mode gpipe --profiling no-profiling" (trap 'kill 0' SIGINT; \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ & \ python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ & \ wait) ================================================ FILE: training/lora/example/redpajama-incite-chat-3b.py ================================================ import os import json os.environ["CUDA_VISIBLE_DEVICES"]="0" import torch import transformers import torch.nn as nn import bitsandbytes as bnb from datasets import Dataset from peft import LoraConfig, get_peft_model from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM # this script should take around 14GB VRAM MODEL_NAME='redpajama-incite-chat-3b-sample-lowrank' # read datasets with open('data/OIG-chip2/unified_chip2.jsonl', 'r') as fp: data = [json.loads(x) for x in fp.readlines()] model = AutoModelForCausalLM.from_pretrained( "togethercomputer/RedPajama-INCITE-Chat-3B-v1", device_map='auto', ) tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Chat-3B-v1") tokenizer.pad_token = tokenizer.eos_token for param in model.parameters(): param.requires_grad = False # freeze the model - train adapters later if param.ndim == 1: # cast the small parameters (e.g. layernorm) to fp32 for stability param.data = param.data.to(torch.float32) model.gradient_checkpointing_enable() # reduce number of stored activations model.enable_input_require_grads() def print_trainable_parameters(model): """ Prints the number of trainable parameters in the model. """ trainable_params = 0 all_param = 0 for _, param in model.named_parameters(): all_param += param.numel() if param.requires_grad: trainable_params += param.numel() print( f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" ) config = LoraConfig( r=16, lora_alpha=32, target_modules=["query_key_value", "xxx"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM" ) model = get_peft_model(model, config) print_trainable_parameters(model) ## Training data = Dataset.from_list(data) data = data.map(lambda samples: tokenizer(samples['text']), batched=True) trainer = transformers.Trainer( model=model, train_dataset=data, args=transformers.TrainingArguments( per_device_train_batch_size=4, gradient_accumulation_steps=4, warmup_steps=100, max_steps=200, learning_rate=2e-4, fp16=True, logging_steps=1, output_dir='outputs' ), data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) ) model.config.use_cache = False # silence the warnings. Please re-enable for inference! trainer.train() # save the trained adapter to disk model.save_pretrained(f"outputs/{MODEL_NAME}") ================================================ FILE: training/lora/example/redpajama-incite-chat-3b_inference.py ================================================ import torch from peft import PeftModel, PeftConfig from transformers import AutoModelForCausalLM, AutoTokenizer peft_model_path ='outputs/redpajama-incite-chat-3b-sample-lowrank' config = PeftConfig.from_pretrained(peft_model_path) model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto') tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) # Load the Lora model model = PeftModel.from_pretrained(model, peft_model_path) batch = tokenizer(": Hello!\n:", return_tensors='pt') with torch.cuda.amp.autocast(): output_tokens = model.generate(**batch, max_new_tokens=50) print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True)) ================================================ FILE: training/modules/__init__.py ================================================ ================================================ FILE: training/modules/deberta_modules.py ================================================ import torch import numpy as np import math from torch import nn from torch.nn import functional from torch.utils.checkpoint import checkpoint from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, ) #### Hack Deberta ##### def make_log_bucket_position(relative_pos, bucket_size, max_position): sign = torch.sign(relative_pos) mid = bucket_size // 2 abs_pos = torch.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, torch.abs(relative_pos)) log_pos = torch.ceil(torch.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1)) + mid bucket_pos = torch.where(abs_pos <= mid, relative_pos.type(log_pos.dtype), log_pos * sign).long() return bucket_pos def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device='cpu'): q_ids = torch.arange(0, query_size, device=device) k_ids = torch.arange(0, key_size, device=device) rel_pos_ids = q_ids[:, None] - torch.tile(k_ids, (q_ids.shape[0], 1)) if bucket_size > 0 and max_position > 0: rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) rel_pos_ids = rel_pos_ids[:query_size, :] rel_pos_ids = rel_pos_ids.unsqueeze(0) return rel_pos_ids from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax, StableDropout class DisentangledSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " f"heads ({config.num_attention_heads})" ) self.num_attention_heads = config.num_attention_heads _attention_head_size = config.hidden_size // config.num_attention_heads self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) self.share_att_key = getattr(config, "share_att_key", False) self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] self.relative_attention = getattr(config, "relative_attention", False) if self.relative_attention: self.position_buckets = getattr(config, "position_buckets", -1) self.max_relative_positions = getattr(config, "max_relative_positions", -1) if self.max_relative_positions < 1: self.max_relative_positions = config.max_position_embeddings self.pos_ebd_size = self.max_relative_positions if self.position_buckets > 0: self.pos_ebd_size = self.position_buckets self.pos_dropout = StableDropout(config.hidden_dropout_prob) if not self.share_att_key: if "c2p" in self.pos_att_type: self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) if "p2c" in self.pos_att_type: self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = StableDropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, attention_heads): new_x_shape = x.size()[:-1] + (attention_heads, -1) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1)) def forward( self, hidden_states, attention_mask, output_attentions=False, query_states=None, relative_pos=None, rel_embeddings=None, ): if query_states is None: query_states = hidden_states query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads) key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads) value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads) rel_att = None # Take the dot product between "query" and "key" to get the raw attention scores. scale_factor = 1 if "c2p" in self.pos_att_type: scale_factor += 1 if "p2c" in self.pos_att_type: scale_factor += 1 scale = math.sqrt(query_layer.size(-1) * scale_factor) attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale if self.relative_attention: rel_embeddings = self.pos_dropout(rel_embeddings) rel_att = self.disentangled_attention_bias( query_layer, key_layer, relative_pos, rel_embeddings, scale_factor ) if rel_att is not None: attention_scores = attention_scores + rel_att attention_scores = attention_scores attention_scores = attention_scores.view( -1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1) ) # bsz x height x length x dimension attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) attention_probs = self.dropout(attention_probs) context_layer = torch.bmm( attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer ) context_layer = ( context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1)) .permute(0, 2, 1, 3) .contiguous() ) new_context_layer_shape = context_layer.size()[:-2] + (-1,) context_layer = context_layer.view(*new_context_layer_shape) if output_attentions: return (context_layer, attention_probs) else: return context_layer def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): if relative_pos is None: q = query_layer.size(-2) relative_pos = build_relative_position( q, key_layer.size(-2), bucket_size=self.position_buckets, max_position=self.max_relative_positions, device=query_layer.device, ) if relative_pos.dim() == 2: relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) elif relative_pos.dim() == 3: relative_pos = relative_pos.unsqueeze(1) # bsz x height x query x key elif relative_pos.dim() != 4: raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}") att_span = self.pos_ebd_size rel_embeddings = rel_embeddings[0 : att_span * 2, :].unsqueeze(0) if self.share_att_key: pos_query_layer = self.transpose_for_scores( self.query_proj(rel_embeddings), self.num_attention_heads ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1) pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads).repeat( query_layer.size(0) // self.num_attention_heads, 1, 1 ) else: if "c2p" in self.pos_att_type: pos_key_layer = self.transpose_for_scores( self.pos_key_proj(rel_embeddings), self.num_attention_heads ).repeat( query_layer.size(0) // self.num_attention_heads, 1, 1 ) # .split(self.all_head_size, dim=-1) if "p2c" in self.pos_att_type: pos_query_layer = self.transpose_for_scores( self.pos_query_proj(rel_embeddings), self.num_attention_heads ).repeat( query_layer.size(0) // self.num_attention_heads, 1, 1 ) # .split(self.all_head_size, dim=-1) score = 0 # content->position if "c2p" in self.pos_att_type: scale = math.sqrt(pos_key_layer.size(-1) * scale_factor) c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) c2p_att = torch.gather( c2p_att, dim=-1, index=c2p_pos.squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]), ) score += c2p_att / scale # position->content if "p2c" in self.pos_att_type: scale = math.sqrt(pos_query_layer.size(-1) * scale_factor) if key_layer.size(-2) != query_layer.size(-2): r_pos = build_relative_position( key_layer.size(-2), key_layer.size(-2), bucket_size=self.position_buckets, max_position=self.max_relative_positions, device=query_layer.device, ) r_pos = r_pos.unsqueeze(0) else: r_pos = relative_pos p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2)) p2c_att = torch.gather( p2c_att, dim=-1, index=p2c_pos.squeeze(0).expand([query_layer.size(0), key_layer.size(-2), key_layer.size(-2)]), ).transpose(-1, -2) score += p2c_att / scale return score import transformers.models.deberta_v2.modeling_deberta_v2 transformers.models.deberta_v2.modeling_deberta_v2.DisentangledSelfAttention = DisentangledSelfAttention #### Hack Deberta ##### from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2Embeddings, ConvLayer from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2Layer from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2Encoder as _DebertaV2Encoder from transformers.models.deberta_v2.configuration_deberta_v2 import DebertaV2Config from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout, ContextPooler class DebertaV2Layers(_DebertaV2Encoder): def __init__(self, config, first_block=False): super(_DebertaV2Encoder, self).__init__() self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) self.relative_attention = getattr(config, "relative_attention", False) if self.relative_attention: self.max_relative_positions = getattr(config, "max_relative_positions", -1) if self.max_relative_positions < 1: self.max_relative_positions = config.max_position_embeddings self.position_buckets = getattr(config, "position_buckets", -1) pos_ebd_size = self.max_relative_positions * 2 if self.position_buckets > 0: pos_ebd_size = self.position_buckets * 2 self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size) self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] if "layer_norm" in self.norm_rel_ebd: self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True) if first_block: self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None else: self.conv = None self.gradient_checkpointing = True # TODO if hasattr(self, 'LayerNorm'): for p in self.LayerNorm.parameters(): p.requires_grad = False if hasattr(self, 'rel_embeddings'): for p in self.rel_embeddings.parameters(): p.requires_grad = False def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): if self.relative_attention and relative_pos is None: q = query_states.size(-2) if query_states is not None else hidden_states.size(-2) relative_pos = build_relative_position( q, hidden_states.size(-2), bucket_size=self.position_buckets, max_position=self.max_relative_positions, device=hidden_states.device, ) return relative_pos def forward( self, hidden_states, attention_mask, query_states=None, relative_pos=None, ): if attention_mask.dim() <= 2: input_mask = attention_mask else: input_mask = (attention_mask.sum(-2) > 0).byte() attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) next_kv = hidden_states # TODOs rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs) return custom_forward output_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), next_kv, attention_mask, query_states, relative_pos, rel_embeddings, ) else: output_states = layer_module( next_kv, attention_mask, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, ) if i == 0 and self.conv is not None: output_states = self.conv(hidden_states, output_states, input_mask) next_kv = output_states return output_states class DebertaClassificationHead(nn.Module): def __init__(self, config): super().__init__() self.config = config self.pooler = ContextPooler(config) self.classifier = nn.Linear( self.pooler.output_dim, getattr(config, "num_labels", 2), ) drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out self.dropout = StableDropout(drop_out) def forward(self, hidden_states, input_ids=None): pooled_output = self.pooler(hidden_states) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) return logits ================================================ FILE: training/modules/dist_deberta_pp_module.py ================================================ from torch import nn from .deberta_modules import DebertaV2Embeddings, DebertaV2Layers, DebertaClassificationHead class DebertaStageBase(nn.Module): def __init__(self, args, config): super().__init__() self._to_cpu = False # (args.dist_backend == "gloo") self.config = config def _create_first_layer(self): return DebertaV2Embeddings(self.config) def _create_last_layer(self): return DebertaClassificationHead(self.config) def _create_transformer_layers(self, first_block=False): return DebertaV2Layers(self.config, first_block=first_block) # TODO: checkpoint class DebertaStageFirst(DebertaStageBase): def __init__(self, args, config, device): super().__init__(args, config) self.device = device self.embeddings = self._create_first_layer().to(device) self.encoder = self._create_transformer_layers(first_block=True).to(device) def forward(self, x, token_type_ids=None, attention_mask=None): if self._to_cpu: x = x.to(self.device) if token_type_ids is not None: token_type_ids = token_type_ids.to(self.device) if attention_mask is not None: attention_mask = attention_mask.to(self.device) x = self.embeddings(x, token_type_ids=token_type_ids) out = self.encoder(x, attention_mask=attention_mask) return out.cpu() if self._to_cpu else out class DebertaStageMiddle(DebertaStageBase): def __init__(self, args, config, device): super().__init__(args, config) self.device = device self.encoder = self._create_transformer_layers(first_block=False).to(device) def forward(self, x, attention_mask=None): if self._to_cpu: x = x.to(self.device) if attention_mask is not None: attention_mask = attention_mask.to(self.device) out = self.encoder(x, attention_mask=attention_mask) return out.cpu() if self._to_cpu else out class DebertaStageLast(DebertaStageBase): def __init__(self, args, config, device): super().__init__(args, config) self.device = device self.encoder = self._create_transformer_layers(first_block=False).to(device) self.output_head = self._create_last_layer().to(device) def forward(self, x, attention_mask=None, input_ids=None): if self._to_cpu: x = x.to(self.device) if attention_mask is not None: attention_mask = attention_mask.to(self.device) x = self.encoder(x, attention_mask=attention_mask) out = self.output_head(x) return out.cpu() if self._to_cpu else out ================================================ FILE: training/modules/dist_gpt_fsdp_module.py ================================================ import torch from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP from .task_modules import GlueClassification from .gpt_modules import MultiHeadAttention, TwoLayerMLP, GPTEmbedding from fairscale.nn.checkpoint import checkpoint_wrapper # This is only implemented to support checkpoint in FSDP class GPTTransformerFsdpLayer(torch.nn.Module): def __init__(self, model_dim, head_num, feedforward_dim=2048, layer_norm_eps=1e-5, use_checkpoint=True, explicit_fsdp=False) -> None: super(GPTTransformerFsdpLayer, self).__init__() self.attn = MultiHeadAttention(model_dim, head_num) if use_checkpoint: self.attn = checkpoint_wrapper(self.attn) if explicit_fsdp: self.attn = FSDP(self.attn, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, flatten_parameters=False) # Implementation of Feedforward model self.mlp = TwoLayerMLP(model_dim, feedforward_dim) if use_checkpoint: self.mlp = checkpoint_wrapper(self.mlp) if explicit_fsdp: self.attn = FSDP(self.attn, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, flatten_parameters=False) self.norm1 = torch.nn.LayerNorm(model_dim, eps=layer_norm_eps) self.norm2 = torch.nn.LayerNorm(model_dim, eps=layer_norm_eps) # self.dropout1 = nn.Dropout(dropout) # self.dropout2 = nn.Dropout(dropout) def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.norm1(x) # x = x + self.dropout_1(self.attn(x2, x2, x2)) x.requires_grad_(True) x = self.attn(x) x = self.norm2(x) # x = x + self.dropout_2(self.ff(x2)) x.requires_grad_(True) x = self.mlp(x) return x class GPTGlueFsdpModel(torch.nn.Module): def __init__(self, args, vocab_size, num_classes, use_checkpoint=True): super(GPTGlueFsdpModel, self).__init__() self.embedding = GPTEmbedding(vocab_size, args.embedding_dim, args.seq_length) module_list = [] for _ in range(args.num_layers): module_list.append(GPTTransformerFsdpLayer(args.embedding_dim, args.num_heads, args.embedding_dim * 4, use_checkpoint, explicit_fsdp=False)) self.transformers = torch.nn.Sequential(*module_list) self.classifier = GlueClassification(args.embedding_dim, num_classes) def forward(self, input_ids, position_ids): input_emb = self.embedding(input_ids, position_ids) output_emb = self.transformers(input_emb) return self.classifier(output_emb) class GPTFsdpStageBase(torch.nn.Module): def __init__(self, args, num_stage_layers, vocab_size, num_classes, use_checkpoint=True, explicit_fsdp=True): super(GPTFsdpStageBase, self).__init__() self._vocab_size = vocab_size self._explicit_fsdp = explicit_fsdp self._use_checkpoint = use_checkpoint self._embedding_dim = args.embedding_dim # embedding dimension self._seq_length = args.seq_length self._num_classes = num_classes # the dimension of the feedforward aws_network model in nn.TransformerEncoder self._feedforward_dim = args.embedding_dim * 4 self._num_heads = args.num_heads # the number of heads in the multi-head attention models self._num_layers = num_stage_layers def _create_first_layer(self): emb = GPTEmbedding(self._vocab_size, self._embedding_dim, self._seq_length) if self._explicit_fsdp: return FSDP(emb, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, flatten_parameters=False) else: return emb def _create_last_layer(self): classifier = GlueClassification(self._embedding_dim, self._num_classes) if self._explicit_fsdp: return FSDP(classifier, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, flatten_parameters=False) else: return classifier def _create_fsdp_transformer_layer(self): return GPTTransformerFsdpLayer(self._embedding_dim, self._num_heads, self._feedforward_dim, use_checkpoint=self._use_checkpoint, explicit_fsdp=self._explicit_fsdp) class GPTFsdpStageFirst(GPTFsdpStageBase): def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): super(GPTFsdpStageFirst, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, explicit_fsdp) self.device = device module_list = [self._create_first_layer()] for _ in range(self._num_layers): module_list.append(self._create_fsdp_transformer_layer()) self.model = torch.nn.Sequential(*module_list).to(device) def forward(self, x): out = self.model(x) return out class GPTFsdpStageMiddle(GPTFsdpStageBase): def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): super(GPTFsdpStageMiddle, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, explicit_fsdp) self.device = device module_list = [] for _ in range(self._num_layers): module_list.append(self._create_fsdp_transformer_layer()) self.model = torch.nn.Sequential(*module_list).to(device) def forward(self, x): out = self.model(x) return out class GPTFsdpStageLast(GPTFsdpStageBase): def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): super(GPTFsdpStageLast, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, explicit_fsdp) self.device = device module_list = [] for _ in range(self._num_layers): module_list.append(self._create_fsdp_transformer_layer()) module_list.append(self._create_last_layer()) self.model = torch.nn.Sequential(*module_list).to(device) def forward(self, x): out = self.model(x) return out ================================================ FILE: training/modules/dist_gpt_pp_module.py ================================================ import numpy as np from torch import nn from comm.comm_utils import * from copy import deepcopy class GPTStageBase(nn.Module): def __init__(self, args, config): super(GPTStageBase, self).__init__() self._to_cpu = (args.dist_backend == "gloo") self._embedding_dim = args.embedding_dim # embedding dimension self._seq_length = args.seq_length # the dimension of the feedforward aws_network model in nn.TransformerEncoder self._feedforward_dim = args.embedding_dim * 4 self._num_heads = args.num_heads # the number of heads in the multi-head attention models self._num_layers = args.num_layers self._layer_begin = get_pipeline_parallel_rank() * args.num_layers self._layer_end = min(self._layer_begin + args.num_layers, args.max_layers) self._task_type = getattr(args, 'task_type', 'language_model') self.load_pretrained_model = args.load_pretrained_model self.model_name = args.model_name self.config = config if hasattr(args, 'model_type'): if args.model_type == "gpt2": from .hf_gpt2_modules import GPTEmbeddings, GPTBlock, GPTLMHead elif args.model_type == "gptj": from .hf_gptj_modules import GPTEmbeddings, GPTBlock, GPTLMHead elif args.model_type == "gptneox": from .hf_gptneox_modules import GPTEmbeddings, GPTBlock, GPTLMHead elif args.model_type == 'llama': from .llama_modules import GPTEmbeddings, GPTBlock, GPTLMHead else: raise Exception("unknown") else: raise Exception("!!!! model type not defined") self._GPTEmbeddings = GPTEmbeddings self._GPTBlock = GPTBlock self._GPTLMHead = GPTLMHead def _create_first_layer(self): layer = self._GPTEmbeddings(deepcopy(self.config)) if self.load_pretrained_model: print('loading embs') ret = layer.load_state_dict( torch.load(f'{self.model_name}/pytorch_embs.pt'), strict=False ) if len(ret.missing_keys): print('The following weight keys are missing:') print(ret.missing_keys) if len(ret.unexpected_keys): print('The following weight keys are unexpected:') print(ret.unexpected_keys) return layer def _create_last_layer(self): layer = self._GPTLMHead(deepcopy(self.config)) if self.load_pretrained_model: print('loading lm_head') ret = layer.load_state_dict( torch.load(f'{self.model_name}/pytorch_lm_head.pt'), strict=False ) if len(ret.missing_keys): print('The following weight keys are missing:') print(ret.missing_keys) if len(ret.unexpected_keys): print('The following weight keys are unexpected:') print(ret.unexpected_keys) return layer def _create_transformer_layer(self, layer_idx=0): config = deepcopy(self.config) layer = self._GPTBlock(config, layer_id=layer_idx) # TODO: checkpoint if self.load_pretrained_model: print(f'loading layer {layer_idx}') ret = layer.load_state_dict( torch.load(f'{self.model_name}/pytorch_{layer_idx}.pt'), strict=False ) if len(ret.missing_keys): print('The following weight keys are missing:') print(ret.missing_keys) if len(ret.unexpected_keys): print('The following weight keys are unexpected:') print(ret.unexpected_keys) return layer class GPTStageFull(GPTStageBase): def __init__(self, args, config, device): super(GPTStageFull, self).__init__(args, config) self.device = device module_list = [self._create_first_layer()] for layer_idx in range(self._layer_begin, self._layer_end): module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) if hasattr(args, 'skip_lm_head') and args.skip_lm_head: pass else: module_list.append(self._create_last_layer()) self.model = nn.Sequential(*module_list).to(device) def forward(self, x, **kargs): for module in self.model: x = module(x, **kargs) return x class GPTStageFirst(GPTStageBase): def __init__(self, args, config, device): super(GPTStageFirst, self).__init__(args, config) self.device = device module_list = [self._create_first_layer()] for layer_idx in range(self._layer_begin, self._layer_end): module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) self.model = nn.Sequential(*module_list).to(device) def forward(self, x, **kargs): for module in self.model: x = module(x, **kargs) return x # out = self.model(x.to(self.device), **kargs) # return out.cpu() if self._to_cpu else out class GPTStageMiddle(GPTStageBase): def __init__(self, args, config, device): super(GPTStageMiddle, self).__init__(args, config) self.device = device module_list = [] for layer_idx in range(self._layer_begin, self._layer_end): module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) self.model = nn.Sequential(*module_list).to(device) def forward(self, x, **kargs): for module in self.model: x = module(x, **kargs) return x # out = self.model(x.to(self.device), **kargs) if self._to_cpu else self.model(x) # return out.cpu() if self._to_cpu else out class GPTStageLast(GPTStageBase): def __init__(self, args, config, device): super(GPTStageLast, self).__init__(args, config) self.device = device module_list = [] for layer_idx in range(self._layer_begin, self._layer_end): module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) if hasattr(args, 'skip_lm_head') and args.skip_lm_head: pass else: module_list.append(self._create_last_layer()) self.model = nn.Sequential(*module_list).to(device) # self.upscale_last = nn.Linear(args.embedding_dim, 9216).to(device) def forward(self, x, **kargs): for module in self.model: x = module(x, **kargs) return x # def forward(self, x, **kargs): # for module in self.model[:-1]: # x = module(x, **kargs) # hid = x # x = self.model[-1](x, **kargs) # hid = self.upscale_last(hid) # loss = torch.nn.functional.mse_loss(hid, kargs['teacher_hidden_states']) # print(loss.item()) # return x, loss ================================================ FILE: training/modules/hf_gpt2_modules.py ================================================ import torch import math import numpy as np from torch import nn from torch.nn import functional from torch.utils.checkpoint import checkpoint from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, ) from transformers.models.gpt2.modeling_gpt2 import GPT2Attention as _GPT2Attention from transformers.models.gpt2.modeling_gpt2 import GPT2MLP as _GPT2MLP from transformers.models.gpt2.modeling_gpt2 import GPT2Block as _GPT2Block from transformers.models.gpt2.modeling_gpt2 import GPT2Model as _GPT2Model from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel as _GPT2LMHeadModel from transformers.models.gpt2.modeling_gpt2 import GPT2ForSequenceClassification as _GPT2ForSequenceClassification from transformers.models.gpt2.configuration_gpt2 import GPT2Config as GPTConfig from typing import Optional, Tuple, Union # @torch.jit.script def gpt_loss_func(input, target): lm_logits, labels = input, target shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() loss = functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss class GPTEmbeddings(nn.Module): def __init__(self, config): super().__init__() self.config = config self.embed_dim = config.hidden_size self.wte = nn.Embedding(config.vocab_size, self.embed_dim) self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.drop = nn.Dropout(config.embd_pdrop) def forward(self, input_ids, **kargs): device = input_ids.device # input ids input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) batch_size = input_ids.shape[0] # position ids position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds hidden_states = self.drop(hidden_states) return hidden_states class GPTAttention(_GPT2Attention): def _attn(self, query, key, value, attention_mask=None, head_mask=None, prefix_masks=None): attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: attn_weights = attn_weights / torch.tensor( value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device ) # Layer-wise attention scaling if self.scale_attn_by_inverse_layer_idx: attn_weights = attn_weights / float(self.layer_idx + 1) if not self.is_cross_attention: # if only "normal" attention layer implements causal mask query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` if prefix_masks is not None: bsz = query.size(0) causal_mask = causal_mask.repeat(bsz, 1, 1, 1) # (bsz, 1, src_len, tgt_len) causal_mask = causal_mask.permute(0, 3, 1, 2) # (bsz, tgt_len, 1, src_len) causal_mask[prefix_masks.bool()] = 1 causal_mask = causal_mask.permute(0, 2, 3, 1) # (bsz, 1, src_len, tgt_len) mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights, mask_value) if attention_mask is not None: # Apply the attention mask attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise attn_weights = attn_weights.type(value.dtype) attn_weights = self.attn_dropout(attn_weights) # Mask heads if we want to if head_mask is not None: attn_weights = attn_weights * head_mask attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights def forward( self, hidden_states: Optional[Tuple[torch.FloatTensor]], layer_past: Optional[Tuple[torch.Tensor]] = None, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, prefix_masks = None, ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: if encoder_hidden_states is not None: if not hasattr(self, "q_attn"): raise ValueError( "If class is used as cross attention, the weights `q_attn` have to be defined. " "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." ) query = self.q_attn(hidden_states) key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) query = self._split_heads(query, self.num_heads, self.head_dim) key = self._split_heads(key, self.num_heads, self.head_dim) value = self._split_heads(value, self.num_heads, self.head_dim) if layer_past is not None: past_key, past_value = layer_past key = torch.cat((past_key, key), dim=-2) value = torch.cat((past_value, value), dim=-2) if use_cache is True: present = (key, value) else: present = None if self.reorder_and_upcast_attn: attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) else: attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask, prefix_masks=prefix_masks) attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) attn_output = self.c_proj(attn_output) attn_output = self.resid_dropout(attn_output) outputs = (attn_output, present) if output_attentions: outputs += (attn_weights,) return outputs # a, present, (attentions) class GPTBlock(_GPT2Block): def __init__(self, config, layer_idx=None, use_checkpoint=True): super(_GPT2Block, self).__init__() hidden_size = config.hidden_size inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = GPTAttention(config, layer_idx=layer_idx) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = _GPT2MLP(inner_dim, config) self.config = config self.use_checkpoint = use_checkpoint def attn_res(x: torch.Tensor, prefix_masks: torch.Tensor) -> torch.Tensor: res = x x = self.ln_1(x) x = self.attn(x, prefix_masks=prefix_masks)[0] return x + res self.attn_res = attn_res def mlp_res(x: torch.Tensor) -> torch.Tensor: res = x x = self.ln_2(x) x = self.mlp(x) return x + res self.mlp_res = mlp_res def forward(self, x: torch.Tensor, prefix_masks=None, **kargs) -> torch.Tensor: if not self.training: x = self.attn_res(x, prefix_masks=prefix_masks) x = self.mlp_res(x) return x if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.attn_res, x, prefix_masks) else: x = self.attn_res(x, prefix_masks=prefix_masks) if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.mlp_res, x) else: x = self.mlp_res(x) return x class GPTModel(_GPT2Model): def __init__(self, config): super(_GPT2Model, self).__init__(config) self.embed_dim = config.hidden_size emb_layer = GPTEmbeddings(config) self.wte = emb_layer.wte self.wpe = emb_layer.wpe self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([GPTBlock(config, layer_idx=i, use_checkpoint=True) for i in range(config.num_hidden_layers)]) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) # Model parallel self.model_parallel = False self.device_map = None self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() def forward(self, input_ids, attention_mask=None, **kargs): device = input_ids.device # input ids input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) batch_size = input_shape[0] # position ids position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds hidden_states = self.drop(hidden_states) hidden_states_tuple = tuple() for layer in self.h: hidden_states_tuple = hidden_states_tuple + (hidden_states,) hidden_states = layer(hidden_states) hidden_states = self.ln_f(hidden_states) hidden_states_tuple = hidden_states_tuple + (hidden_states,) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=None, hidden_states=hidden_states_tuple, attentions=None, cross_attentions=None, ) class GPTLMHead(nn.Module): def __init__(self, config): super().__init__() self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) def forward(self, x, **kargs): x = self.ln_f(x) x = self.lm_head(x) return x class GPTLMHeadModel(_GPT2LMHeadModel): def __init__(self, config): super(_GPT2LMHeadModel, self).__init__(config) self.transformer = GPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) # ln_f will be calculated in self.transformer # Model parallel self.model_parallel = False self.device_map = None # Initialize weights and apply final processing self.post_init() class GPTClassificationHead(nn.Module): def __init__(self, config): super().__init__() self.config = config self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.score = nn.Linear(config.n_embd, config.num_labels, bias=False) def forward(self, hidden_states, input_ids=None): batch_size, sequence_length = hidden_states.shape[:2] if input_ids is not None: sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 else: sequence_lengths = -1 pooled_hidden_states = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths] logits = self.score(self.ln_f(pooled_hidden_states)) return logits class GPTForClassification(_GPT2ForSequenceClassification): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = GPTModel(config) self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) # Model parallel self.model_parallel = False self.device_map = None # Initialize weights and apply final processing self.post_init() # def forward(self, input_ids, labels=None): # ret = self.transformer(input_ids) # pool_hidden_state = ret.last_hidden_state[:, -1] # logits = self.score(pool_hidden_state) # loss = functional.cross_entropy(logits, labels) # return loss ================================================ FILE: training/modules/hf_gptj_modules.py ================================================ import os import torch import math import numpy as np from torch import nn from torch.nn import functional from torch.utils.checkpoint import checkpoint from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, ) from transformers.models.gptj.modeling_gptj import ACT2FN from transformers.models.gptj.modeling_gptj import GPTJAttention as _GPTJAttention from transformers.models.gptj.modeling_gptj import GPTJMLP as _GPTJMLP from transformers.models.gptj.modeling_gptj import GPTJBlock as _GPTJBlock from transformers.models.gptj.modeling_gptj import GPTJModel as _GPTJModel from transformers.models.gptj.modeling_gptj import fixed_pos_embedding from transformers.models.gptj.configuration_gptj import GPTJConfig as GPTConfig from transformers.models.gptj.modeling_gptj import fixed_pos_embedding, rotate_every_two, apply_rotary_pos_emb # @torch.jit.script def gpt_loss_func(input, target): lm_logits, labels = input, target shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() loss = functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss # put things on GPU to avoid high CPU usage def fixed_pos_embedding(x, seq_dim=1, seq_len=None): dim = x.shape[-1] if seq_len is None: seq_len = x.shape[seq_dim] inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=x.device) / dim)) sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(seq_len, device=x.device), inv_freq).float() return torch.sin(sinusoid_inp), torch.cos(sinusoid_inp) class GPTJMLP(_GPTJMLP): def __init__(self, intermediate_size, config, device='cpu'): # in MLP: intermediate_size= 4 * embed_dim super(_GPTJMLP, self).__init__() embed_dim = config.n_embd self.fc_in = nn.Linear(embed_dim, intermediate_size, device=device) self.fc_out = nn.Linear(intermediate_size, embed_dim, device=device) self.act = ACT2FN[config.activation_function] self.dropout = nn.Dropout(config.resid_pdrop) class GPTJAttention(_GPTJAttention): def __init__(self, config, device='cpu'): super(_GPTJAttention, self).__init__() max_positions = config.max_position_embeddings self.register_buffer( "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( 1, 1, max_positions, max_positions ), ) self.register_buffer("masked_bias", torch.tensor(-1e9)) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) self.embed_dim = config.hidden_size self.num_attention_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_attention_heads if self.head_dim * self.num_attention_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and" f" `num_attention_heads`: {self.num_attention_heads})." ) self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()) self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False, device=device) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False, device=device) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False, device=device) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False, device=device) self.rotary_dim = None if config.rotary_dim is not None: self.rotary_dim = config.rotary_dim def _attn( self, query, key, value, attention_mask=None, head_mask=None, prefix_masks=None, ): # compute causal mask from causal mask buffer query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) if prefix_masks is not None: bsz = query.size(0) causal_mask = causal_mask.repeat(bsz, 1, 1, 1) # (bsz, 1, src_len, tgt_len) causal_mask = causal_mask.permute(0, 3, 1, 2) # (bsz, tgt_len, 1, src_len) causal_mask[prefix_masks.bool()] = 1 causal_mask = causal_mask.permute(0, 2, 3, 1) # (bsz, 1, src_len, tgt_len) # Keep the attention weights computation in fp32 to avoid overflow issues query = query.to(torch.float32) key = key.to(torch.float32) attn_weights = torch.matmul(query, key.transpose(-1, -2)) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights, mask_value) attn_weights = attn_weights / self.scale_attn if attention_mask is not None: # Apply the attention mask attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = attn_weights.to(value.dtype) attn_weights = self.attn_dropout(attn_weights) # Mask heads if we want to if head_mask is not None: attn_weights = attn_weights * head_mask attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights def forward( self, hidden_states, attention_mask=None, layer_past=None, head_mask=None, offset=None, use_cache=False, output_attentions=False, prefix_masks=None, ): query = self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) query = self._split_heads(query, self.num_attention_heads, self.head_dim, True) key = self._split_heads(key, self.num_attention_heads, self.head_dim, True) value = self._split_heads(value, self.num_attention_heads, self.head_dim, False) seq_len = key.shape[1] if layer_past is not None: if offset is None: offset = layer_past[0].shape[-2] seq_len += layer_past[0].shape[-2] if offset is None: offset = 0 if self.rotary_dim is not None: k_rot = key[:, :, :, : self.rotary_dim] k_pass = key[:, :, :, self.rotary_dim :] q_rot = query[:, :, :, : self.rotary_dim] q_pass = query[:, :, :, self.rotary_dim :] sincos = fixed_pos_embedding(k_rot, 1, seq_len=seq_len) k_rot = apply_rotary_pos_emb(k_rot, sincos, offset=offset) q_rot = apply_rotary_pos_emb(q_rot, sincos, offset=offset) key = torch.cat([k_rot, k_pass], dim=-1) query = torch.cat([q_rot, q_pass], dim=-1) else: sincos = fixed_pos_embedding(key, 1, seq_len=seq_len) key = apply_rotary_pos_emb(key, sincos, offset=offset) query = apply_rotary_pos_emb(query, sincos, offset=offset) key = key.permute(0, 2, 1, 3) query = query.permute(0, 2, 1, 3) if layer_past is not None: past_key = layer_past[0] past_value = layer_past[1] key = torch.cat((past_key, key), dim=-2) value = torch.cat((past_value, value), dim=-2) if use_cache is True: present = (key, value) else: present = None # compute self-attention: V x Softmax(QK^T) attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask, prefix_masks=prefix_masks) attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) attn_output = self.out_proj(attn_output) attn_output = self.resid_dropout(attn_output) outputs = (attn_output, present) if output_attentions: outputs += (attn_weights,) return outputs # a, present, (attentions) class GPTEmbeddings(nn.Module): def __init__(self, config, device='cpu'): super().__init__() self.config = config self.embed_dim = config.hidden_size self.wte = nn.Embedding(config.vocab_size, self.embed_dim, device=device) @classmethod def from_pretrained(cls, model_path, config=None): if config is None: config = GPTConfig.from_pretrained(model_path) # module = cls(config).eval() module = torch.nn.utils.skip_init(cls, config).eval() # fast init try: module.load_state_dict(torch.load(os.path.join( model_path, 'pytorch_embs.pt', ))) except: print(f'Cannot load from . The model is randomly initialized.') return module def forward(self, input_ids, *args, **kargs): # input ids input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) hidden_states = self.wte(input_ids) return hidden_states class GPTBlock(_GPTJBlock): def __init__(self, config, *args, use_checkpoint=True, device='cpu', **kargs): super(_GPTJBlock, self).__init__() inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon, device=device) self.attn = GPTJAttention(config, device=device) self.mlp = GPTJMLP(inner_dim, config, device=device) self.config = config self.use_checkpoint = use_checkpoint def block_forward(x: torch.Tensor, attention_mask: torch.Tensor, prefix_masks: torch.Tensor) -> torch.Tensor: res = x x = self.ln_1(x) x_a = self.attn(x, prefix_masks=prefix_masks, attention_mask=attention_mask)[0] x_m = self.mlp(x) return res + x_a + x_m self.block_forward = block_forward @classmethod def from_pretrained(cls, model_path, config=None, layer_index=None): assert layer_index is not None if config is None: config = GPTConfig.from_pretrained(model_path) # module = cls(config).eval() module = torch.nn.utils.skip_init(cls, config).eval() # fast init try: module.load_state_dict(torch.load(os.path.join( model_path, f'pytorch_{layer_index}.pt', ))) except Exception as e: print('Cannot load from . The model is randomly initialized.') return module def forward(self, x: torch.Tensor, prefix_masks=None, layer_past=None, mask=None, skip_ln=False, **kargs) -> torch.Tensor: if mask is not None: # bool -> float attention_mask = (1e4)*(mask[:, None, None, :]-1.0) else: attention_mask = None if mask is None: if layer_past is not None: offset = layer_past[0].size(2) else: offset = 0 else: # masked tokens offset = (mask-1).sum(-1, keepdims=False) if layer_past is not None: offset += layer_past[0].size(2) if self.training: if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.block_forward, x, attention_mask, prefix_masks) else: x = self.block_forward(x, prefix_masks=prefix_masks) return x else: res = x if not skip_ln: x = self.ln_1(x) x_a = self.attn(x, use_cache=False, layer_past=layer_past, attention_mask=attention_mask, offset=offset, prefix_masks=prefix_masks)[0] x_m = self.mlp(x) return x_a + x_m + res class GPTLMHead(nn.Module): def __init__(self, config, device='cpu'): super().__init__() self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon, device=device) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, device=device) @classmethod def from_pretrained(cls, model_path, config=None): if config is None: config = GPTConfig.from_pretrained(model_path) # module = cls(config).eval() module = torch.nn.utils.skip_init(cls, config).eval() # fast init try: module.load_state_dict(torch.load(os.path.join( model_path, 'pytorch_lm_head.pt', ))) except: print('Cannot load from . The model is randomly initialized.') return module def forward(self, x, **kargs): x = self.ln_f(x) x = self.lm_head(x) return x ================================================ FILE: training/modules/hf_gptneox_modules.py ================================================ import os import torch import numpy as np from torch import nn from torch.nn import functional from torch.utils.checkpoint import checkpoint from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, ) from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXAttention as _GPTNeoXAttention from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer as _GPTNeoXBlock from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel as _GPTNeoXModel from transformers.models.gpt_neox.configuration_gpt_neox import GPTNeoXConfig as GPTConfig from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXRotaryEmbedding try: from flash_attn.flash_attention import FlashAttention flash_attn_installed = True print('>>>>> using flash attention') except ImportError: flash_attn_installed = False try: from fav2.fav2_interface import flash_attn_qkvpacked_func as fav2_qkvpacked_func flash_attn_v2_installed = True print('>>>>> using flash attention v2') class FlashAttentionV2(nn.Module): """Implement the scaled dot product attention with softmax. Arguments --------- softmax_scale: The temperature to use for the softmax attention. (default: 1/sqrt(d_keys) where d_keys is computed at runtime) attention_dropout: The dropout rate to apply to the attention (default: 0.0) """ def __init__(self, softmax_scale=None, attention_dropout=0.0): super().__init__() self.softmax_scale = softmax_scale self.dropout_p = attention_dropout def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None, max_s=None, need_weights=False): """Implements the multihead softmax attention. Arguments --------- qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None if unpadded: (nnz, 3, h, d) key_padding_mask: a bool tensor of shape (B, S) """ assert not need_weights assert qkv.dtype in [torch.float16, torch.bfloat16] assert qkv.is_cuda assert key_padding_mask is None assert cu_seqlens is None assert max_s is None output = fav2_qkvpacked_func( qkv, self.dropout_p if self.training else 0.0, softmax_scale=self.softmax_scale, causal=causal ) return output, None except ImportError: flash_attn_v2_installed = False def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, offset=0): if isinstance(offset, torch.Tensor): realidx = torch.arange(q.shape[-2], device=q.device).view( 1, q.shape[-2]) + offset[:, None] cos = cos.squeeze(0).squeeze(0)[realidx].view(offset.size(0), 1, q.shape[-2], cos.size(-1)) sin = sin.squeeze(0).squeeze(0)[realidx].view(offset.size(0), 1, q.shape[-2], sin.size(-1)) else: cos = cos[..., offset : q.shape[-2] + offset, :] sin = sin[..., offset : q.shape[-2] + offset, :] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed class GPTNeoXAttention(_GPTNeoXAttention): def __init__(self, config): super(_GPTNeoXAttention, self).__init__() self.num_attention_heads = config.num_attention_heads self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_attention_heads self.rotary_ndims = int(self.head_size * config.rotary_pct) max_positions = config.max_position_embeddings self.register_buffer( "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), ) self.register_buffer("masked_bias", torch.tensor(-1e9)) self.rotary_emb = GPTNeoXRotaryEmbedding( self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base ) self.register_buffer( "norm_factor", torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()), persistent=False, ) self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size) self.dense = nn.Linear(config.hidden_size, config.hidden_size) if flash_attn_v2_installed: self.flash_attn = FlashAttentionV2(softmax_scale=1.0/self.norm_factor, attention_dropout = 0) elif flash_attn_installed: self.flash_attn = FlashAttention(softmax_scale=1.0/self.norm_factor, attention_dropout = 0) def forward( self, hidden_states, attention_mask, head_mask=None, layer_past=None, use_cache=False, offset=None, output_attentions=False, ): bsz, tgt_len, _ = hidden_states.shape has_layer_past = layer_past is not None # Compute QKV # Attention heads [batch, seq_len, hidden_size] # --> [batch, seq_len, (np * 3 * head_size)] qkv = self.query_key_value(hidden_states) # [batch, seq_len, (num_heads * 3 * head_size)] # --> [batch, seq_len, num_heads, 3 * head_size] new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) qkv = qkv.view(*new_qkv_shape) # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] query = qkv[..., :self.head_size].permute(0, 2, 1, 3) key = qkv[..., self.head_size:2 * self.head_size].permute(0, 2, 1, 3) value = qkv[..., 2 * self.head_size:].permute(0, 2, 1, 3) # Compute rotary embeddings on rotary_ndims query_rot = query[..., :self.rotary_ndims] query_pass = query[..., self.rotary_ndims:] key_rot = key[..., :self.rotary_ndims] key_pass = key[..., self.rotary_ndims:] # Compute token offset for rotary embeddings (when decoding) seq_len = key.shape[-2] if layer_past is not None: if offset is None: offset = layer_past[0].shape[-2] seq_len += layer_past[0].shape[-2] if offset is None: offset = 0 cos, sin = self.rotary_emb(value, seq_len=seq_len) query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, offset=offset) query = torch.cat((query, query_pass), dim=-1) key = torch.cat((key, key_pass), dim=-1) # Cache QKV values if has_layer_past: past_key = layer_past[0] past_value = layer_past[1] key = torch.cat((past_key, key), dim=-2) value = torch.cat((past_value, value), dim=-2) present = None if use_cache else (key, value) # Compute attention if flash_attn_installed or flash_attn_v2_installed: query = query.permute(0, 2, 1, 3).half() key = key.permute(0, 2, 1, 3).half() value = value.permute(0, 2, 1, 3).half() qkv = torch.stack( [ query.reshape((bsz, tgt_len, self.num_attention_heads, self.head_size)), key.reshape((bsz, tgt_len, self.num_attention_heads, self.head_size)), value.reshape((bsz, tgt_len, self.num_attention_heads, self.head_size)), ], dim=2 ) attn_weights = None attn_output, _ = self.flash_attn(qkv, causal=True) attn_output = attn_output.view(bsz, tgt_len, self.num_attention_heads * self.head_size) else: attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) # Reshape outputs attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) attn_output = self.dense(attn_output) outputs = (attn_output, present) if output_attentions: outputs += (attn_weights, ) return outputs # fix nan problem def _attn(self, query, key, value, attention_mask=None, head_mask=None): # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] # compute causal mask from causal mask buffer batch_size, num_attention_heads, query_length, attn_head_size = query.size( ) key_length = key.size(-2) causal_mask = self.bias[:, :, key_length - query_length:key_length, :key_length].bool() query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) attn_scores = torch.zeros( # empty sometimes gives nan batch_size * num_attention_heads, query_length, key_length, dtype=query.dtype, device=key.device, ) attn_scores = torch.baddbmm( attn_scores, query, key.transpose(1, 2), beta=0.0, alpha=(1.0 / self.norm_factor), ) attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) mask_value = torch.finfo(attn_scores.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to( attn_scores.device) attn_scores = torch.where(causal_mask, attn_scores, mask_value) if attention_mask is not None: # Apply the attention mask attn_scores = attn_scores + attention_mask attn_weights = nn.functional.softmax(attn_scores, dim=-1) attn_weights = attn_weights.to(value.dtype) # Mask heads if we want to if head_mask is not None: attn_weights = attn_weights * head_mask attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights class GPTEmbeddings(nn.Module): def __init__(self, config): super().__init__() self.config = config self.embed_dim = config.hidden_size self.embed_in = nn.Embedding(config.vocab_size, self.embed_dim) @classmethod def from_pretrained(cls, model_path, config=None): if config is None: config = GPTConfig.from_pretrained(model_path) module = cls(config).eval() try: module.load_state_dict( torch.load(os.path.join( model_path, 'pytorch_embs.pt', ))) except: print( f'Cannot load from . The model is randomly initialized.' ) return module def forward(self, input_ids, *args, **kargs): # input ids input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) hidden_states = self.embed_in(input_ids) return hidden_states class GPTBlock(_GPTNeoXBlock): def __init__(self, config, *args, use_checkpoint=True, **kargs): super(_GPTNeoXBlock, self).__init__() self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.attention = GPTNeoXAttention(config) self.mlp = GPTNeoXMLP(config) self.config = config self.use_checkpoint = use_checkpoint def block_forward(x: torch.Tensor, attention_mask: torch.Tensor, prefix_masks: torch.Tensor) -> torch.Tensor: res = x """ To be compatible with https://github.com/huggingface/transformers/blob/a0ae2310ec46a2c592950babc85cf02e325bf6a7/src/transformers/models/gpt_neox/modeling_gpt_neox.py#L336-L347 """ layer_norm_out = self.input_layernorm(x) attention_layer_output = self.attention(layer_norm_out, attention_mask=attention_mask) attn_output = attention_layer_output[0] # outputs = attention_layer_output[1:] if hasattr(self.config, 'use_parallel_residual') and self.config.use_parallel_residual: # x = x + attn(ln1(x)) + mlp(ln2(x)) # x_a = attn_output, mlp_out = self.mlp(self.post_attention_layernorm(x)) return res + attn_output + mlp_out else: # x = x + attn(ln1(x)) # x = x + mlp(ln2(x)) attn_output = attn_output + x mlp_out = self.mlp(self.post_attention_layernorm(attn_output)) return attn_output + mlp_out self.block_forward = block_forward @classmethod def from_pretrained(cls, model_path, config=None, layer_index=None): assert layer_index is not None if config is None: config = GPTConfig.from_pretrained(model_path) module = cls(config).eval().half() try: module.load_state_dict( torch.load( os.path.join( model_path, f'pytorch_{layer_index}.pt', ))) except Exception as e: print( 'Cannot load from . The model is randomly initialized.' ) return module def forward(self, x: torch.Tensor, layer_past=None, mask=None, **kargs) -> torch.Tensor: if mask is not None: # bool -> float attention_mask = 1e9 * (mask[:, None, None, :] - 1) else: attention_mask = None if mask is None: if layer_past is not None: offset = layer_past[0].size(2) else: offset = 0 else: # masked tokens offset = (mask - 1).sum(-1, keepdims=False) if layer_past is not None: offset += layer_past[0].size(2) if self.training: if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.block_forward, x, attention_mask, None) else: x = self.block_forward(x, prefix_masks=prefix_masks) return x else: residual = x ln_out = self.input_layernorm(x) attention_layer_outputs = self.attention( ln_out, attention_mask=attention_mask, ) attn_output = attention_layer_outputs[ 0] # output_attn: a, present, ... mlp_output = self.mlp(self.post_attention_layernorm(x)) x = mlp_output + attn_output + residual return x class GPTLMHead(nn.Module): def __init__(self, config): super().__init__() self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False) @classmethod def from_pretrained(cls, model_path, config=None): if config is None: config = GPTConfig.from_pretrained(model_path) module = cls(config).eval() try: module.load_state_dict( torch.load(os.path.join( model_path, 'pytorch_lm_head.pt', ))) except: print( 'Cannot load from . The model is randomly initialized.' ) return module def forward(self, x, *args, **kargs): x = self.final_layer_norm(x) x = self.embed_out(x) return x ================================================ FILE: training/modules/hf_opt_modules.py ================================================ from typing import List, Optional, Tuple, Union import os import torch from torch import nn from torch.utils.checkpoint import checkpoint import torch.nn.functional as F from transformers.models.opt.modeling_opt import ACT2FN from transformers.models.opt.modeling_opt import OPTDecoderLayer from transformers.models.opt.modeling_opt import OPTAttention as _OPTAttention from transformers.models.opt.modeling_opt import OPTLearnedPositionalEmbedding from transformers.models.opt.configuration_opt import OPTConfig as GPTConfig def _make_causal_mask( input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 ): """ Make causal mask used for bi-directional self-attention. """ bsz, tgt_len = input_ids_shape mask = torch.full((tgt_len, tgt_len), torch.tensor(float("-inf")), device=device) mask_cond = torch.arange(mask.size(-1), device=device) mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) mask = mask.to(dtype) if past_key_values_length > 0: mask = torch.cat( [torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1 ) return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) inverted_mask = 1.0 - expanded_mask return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] combined_attention_mask = None if input_shape[-1] > 1: combined_attention_mask = _make_causal_mask( input_shape, inputs_embeds.dtype, inputs_embeds.device, past_key_values_length=past_key_values_length ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] expanded_attn_mask = _expand_mask( attention_mask, inputs_embeds.dtype,tgt_len=input_shape[-1]) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) return combined_attention_mask class GPTEmbeddings(nn.Module): def __init__(self, config, device='cpu'): super().__init__() self.config = config self.padding_idx = config.pad_token_id self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx, device=device) self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size) if config.word_embed_proj_dim != config.hidden_size: self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False, device=device) else: self.project_in = None @classmethod def from_pretrained(cls, model_path, config=None): if config is None: config = GPTConfig.from_pretrained(model_path) # module = cls(config).eval() module = torch.nn.utils.skip_init(cls, config).eval() # fast init try: module.load_state_dict(torch.load(os.path.join( model_path, 'pytorch_embs.pt', ))) except: print('Cannot load from . The model is randomly initialized.') return module def forward(self, input_ids, past_layer=None, mask=None, **kargs): if mask is None: if past_layer is not None: past_length = past_layer[0].size(2) else: past_length = 0 else: # masked tokens past_length = (mask-1).sum(-1, keepdims=True) if past_layer is not None: past_length += past_layer[0].size(2) device = input_ids.device # input ids input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) batch_size = input_ids.shape[0] inputs_embeds = self.embed_tokens(input_ids) # attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device) # position_embeds = self.embed_positions(attention_mask, past_length) # position ids position_ids = torch.arange( 0, input_shape[-1], dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) position_ids = position_ids + past_length + self.embed_positions.offset position_ids[position_ids<0] = 0 position_embeds = F.embedding( position_ids, self.embed_positions.weight, self.embed_positions.padding_idx, self.embed_positions.max_norm, self.embed_positions.norm_type, self.embed_positions.scale_grad_by_freq, self.embed_positions.sparse) if self.project_in is not None: inputs_embeds = self.project_in(inputs_embeds) hidden_states = inputs_embeds + position_embeds # hidden_states = self.drop(hidden_states) return hidden_states class OPTAttention(_OPTAttention): def __init__( self, embed_dim: int, num_heads: int, dropout: float = 0.0, is_decoder: bool = False, bias: bool = True, device='cpu', ): super(_OPTAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias, device=device) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias, device=device) self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias, device=device) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, device=device) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, hidden_states: torch.Tensor, key_value_states: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling # get key, value proj if is_cross_attention and past_key_value is not None: # reuse k,v, cross_attentions key_states = past_key_value[0] value_states = past_key_value[1] elif is_cross_attention: # cross_attentions key_states = self._shape(self.k_proj(key_value_states), -1, bsz) value_states = self._shape(self.v_proj(key_value_states), -1, bsz) elif past_key_value is not None: # reuse k, v, self_attention key_states = self._shape(self.k_proj(hidden_states), -1, bsz) value_states = self._shape(self.v_proj(hidden_states), -1, bsz) key_states = torch.cat([past_key_value[0], key_states], dim=2) value_states = torch.cat([past_key_value[1], value_states], dim=2) else: # self_attention key_states = self._shape(self.k_proj(hidden_states), -1, bsz) value_states = self._shape(self.v_proj(hidden_states), -1, bsz) if self.is_decoder: # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention # key/value_states (first "if" case) # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of # all previous decoder key/value_states. Further calls to uni-directional self-attention # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_states, value_states) proj_shape = (bsz * self.num_heads, -1, self.head_dim) query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" f" {attn_weights.size()}" ) if attention_mask is not None: if attention_mask.size() != (bsz, 1, tgt_len, src_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) dtype_attn_weights = attn_weights.dtype # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437 if dtype_attn_weights == torch.float16: attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(dtype_attn_weights) else: attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): raise ValueError( f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" f" {layer_head_mask.size()}" ) attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if output_attentions: # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to be reshaped # twice and have to be reused in the following attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) else: attn_weights_reshaped = None attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) return attn_output, attn_weights_reshaped, past_key_value class GPTBlock(OPTDecoderLayer): def __init__(self, config, *args, use_checkpoint=True, device='cpu', **kargs): # super().__init__(config=config, *args, **kargs) super(OPTDecoderLayer, self).__init__() self.embed_dim = config.hidden_size self.self_attn = OPTAttention( embed_dim=self.embed_dim, num_heads=config.num_attention_heads, dropout=config.attention_dropout, is_decoder=True, device=device, ) self.do_layer_norm_before = config.do_layer_norm_before self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, device=device) self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, device=device) self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, device=device) self.final_layer_norm = nn.LayerNorm(self.embed_dim, device=device) self.config = config self.use_checkpoint = use_checkpoint def attn_res(hidden_states: torch.Tensor, attention_mask=None) -> torch.Tensor: residual = hidden_states if self.do_layer_norm_before: hidden_states = self.self_attn_layer_norm(hidden_states) # Self Attention hidden_states, _, present = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, ) hidden_states = residual + hidden_states # 350m applies layer norm AFTER attention if not self.do_layer_norm_before: hidden_states = self.self_attn_layer_norm(hidden_states) return hidden_states self.attn_res = attn_res def mlp_res(hidden_states: torch.Tensor) -> torch.Tensor: # Fully Connected hidden_states_shape = hidden_states.shape hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) residual = hidden_states # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention if self.do_layer_norm_before: hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.fc1(hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(hidden_states) hidden_states = (residual + hidden_states).view(hidden_states_shape) return hidden_states self.mlp_res = mlp_res @classmethod def from_pretrained(cls, model_path, config=None, layer_index=None): assert layer_index is not None if config is None: config = GPTConfig.from_pretrained(model_path) # module = cls(config).eval() # module = cls(config).eval() module = torch.nn.utils.skip_init(cls, config).eval() # fast init try: module.load_state_dict(torch.load(os.path.join( model_path, f'pytorch_{layer_index}.pt', ))) except: print('Cannot load from . The model is randomly initialized.') return module def forward(self, x: torch.Tensor, layer_past=None, mask=None, *args, **kargs) -> torch.Tensor: if layer_past is not None: past_length = layer_past[0].size(2) else: past_length = 0 if mask is None: mask = torch.ones((x.size(0), x.size(1)+past_length), dtype=torch.bool, device=x.device) attention_mask = _prepare_decoder_attention_mask( mask, x.shape[:2], x, past_length ) if self.training: if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.attn_res, x, attention_mask) else: x = self.attn_res(x, attention_mask) if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.mlp_res, x) else: x = self.mlp_res(x) return x else: hidden_states = x # alias residual = hidden_states # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention if self.do_layer_norm_before: hidden_states = self.self_attn_layer_norm(hidden_states) # Self Attention hidden_states, _, present = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, past_key_value=layer_past, ) hidden_states = residual + hidden_states # 350m applies layer norm AFTER attention if not self.do_layer_norm_before: hidden_states = self.self_attn_layer_norm(hidden_states) # Fully Connected hidden_states_shape = hidden_states.shape hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) residual = hidden_states # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention if self.do_layer_norm_before: hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.fc1(hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(hidden_states) hidden_states = (residual + hidden_states).view(hidden_states_shape) return hidden_states class GPTLMHead(nn.Module): def __init__(self, config, device='cpu'): super().__init__() if config.do_layer_norm_before and not config._remove_final_layer_norm: self.final_layer_norm = nn.LayerNorm(config.hidden_size, device=device) else: self.final_layer_norm = None if config.word_embed_proj_dim != config.hidden_size: self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False, device=device) else: self.project_out = None self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False, device=device) @classmethod def from_pretrained(cls, model_path, config=None): if config is None: config = GPTConfig.from_pretrained(model_path) # module = cls(config).eval() module = torch.nn.utils.skip_init(cls, config).eval() # fast init try: module.load_state_dict(torch.load(os.path.join( model_path, 'pytorch_lm_head.pt', ))) except: print('Cannot load from . The model is randomly initialized.') return module def forward(self, x, input_ids=None, *args, **kargs): if self.final_layer_norm is not None: x = self.final_layer_norm(x) if self.project_out is not None: x = self.project_out(x) x = self.lm_head(x) return x ================================================ FILE: training/modules/llama_modules.py ================================================ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # and OPT implementations in this library. It has been modified from its # original forms to accommodate minor architectural differences compared # to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Llama model.""" import os from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss from torch.utils.checkpoint import checkpoint from transformers.activations import ACT2FN from transformers.modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, logging, replace_return_docstrings from transformers import LlamaConfig from flash_attn.layers.rotary import ( apply_rotary_emb_func, apply_rotary_emb_qkv_, apply_rotary_emb_kv_, ) class RotaryEmbedding(torch.nn.Module): """ The rotary position embeddings from RoFormer_ (Su et. al). A crucial insight from the method is that the query and keys are transformed by rotation matrices which depend on the relative positions. Other implementations are available in the Rotary Transformer repo_ and in GPT-NeoX_, GPT-NeoX was an inspiration .. _RoFormer: https://arxiv.org/abs/2104.09864 .. _repo: https://github.com/ZhuiyiTechnology/roformer .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554). A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96 Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py """ def __init__( self, dim: int, base=10000.0, interleaved=False, scale_base=None, scaling_factor=1.0, pos_idx_in_fp32=True, device=None, ): """ interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of 1st half and 2nd half (GPT-NeoX style). pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32, otherwise they might be in lower precision. This option was added because previously (before 2023-07-02), when we construct the position indices, we use the dtype of self.inv_freq. In most cases this would be fp32, but if the model is trained in pure bf16 (not mixed precision), then self.inv_freq would be bf16, and the position indices are also in bf16. Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the embeddings for some positions will coincide. To maintain compatibility with models previously trained in pure bf16, we add this option. """ super().__init__() self.dim = dim self.base = float(base) self.pos_idx_in_fp32 = pos_idx_in_fp32 # Generate and save the inverse frequency buffer (non trainable) inv_freq = self._compute_inv_freq(device) self.register_buffer("inv_freq", inv_freq, persistent=False) self.interleaved = interleaved self.scale_base = scale_base self.scaling_factor = scaling_factor scale = ( (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim) if scale_base is not None else None ) self.register_buffer("scale", scale, persistent=False) self._seq_len_cached = 0 self._cos_cached = None self._sin_cached = None self._cos_k_cached = None self._sin_k_cached = None def _compute_inv_freq(self, device=None): return 1.0 / ( self.base ** ( torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim ) ) def _update_cos_sin_cache(self, seqlen, device=None, dtype=None): # Reset the tables if the sequence length has changed, # if we're on a new device (possibly due to tracing for instance), # or if we're switching from inference mode to training if ( seqlen > self._seq_len_cached or self._cos_cached.device != device or self._cos_cached.dtype != dtype or (self.training and self._cos_cached.is_inference()) ): self._seq_len_cached = seqlen # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16 # And the output of arange can be quite large, so bf16 would lose a lot of precision. # However, for compatibility reason, we add an option to use the dtype of self.inv_freq. if self.pos_idx_in_fp32: t = torch.arange(seqlen, device=device, dtype=torch.float32) t /= self.scaling_factor # We want fp32 here as well since inv_freq will be multiplied with t, and the output # will be large. Having it in bf16 will lose a lot of precision and cause the # cos & sin output to change significantly. # We want to recompute self.inv_freq if it was not loaded in fp32 if self.inv_freq.dtype != torch.float32: inv_freq = self._compute_inv_freq(device=device) else: inv_freq = self.inv_freq else: t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) t /= self.scaling_factor inv_freq = self.inv_freq # Don't do einsum, it converts fp32 to fp16 under AMP # freqs = torch.einsum("i,j->ij", t, self.inv_freq) freqs = torch.outer(t, inv_freq) if self.scale is None: self._cos_cached = torch.cos(freqs).to(dtype) self._sin_cached = torch.sin(freqs).to(dtype) else: power = ( torch.arange( seqlen, dtype=self.scale.dtype, device=self.scale.device ) - seqlen // 2 ) / self.scale_base scale = self.scale.to(device=power.device) ** rearrange( power, "s -> s 1" ) # We want the multiplication by scale to happen in fp32 self._cos_cached = (torch.cos(freqs) * scale).to(dtype) self._sin_cached = (torch.sin(freqs) * scale).to(dtype) self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype) self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype) def forward( self, qkv: torch.Tensor, kv: Optional[torch.Tensor] = None, seqlen_offset: int = 0, ) -> Tuple[torch.Tensor, torch.Tensor]: """ qkv: (batch, seqlen, 3, nheads, headdim) if kv is none, else it's just q of shape (batch, seqlen, nheads, headdim) kv: (batch, seqlen, 2, nheads, headdim) seqlen_offset: can be used in generation where the qkv being passed in is only the last token in the batch. """ seqlen = qkv.shape[1] self._update_cos_sin_cache( seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype ) if kv is None: if self.scale is None: return apply_rotary_emb_qkv_( qkv, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:], None, None, self.interleaved, ) else: return apply_rotary_emb_qkv_( qkv, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:], self._cos_k_cached[seqlen_offset:], self._sin_k_cached[seqlen_offset:], self.interleaved, ) else: q = qkv q = apply_rotary_emb_func( q, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:], self.interleaved, True, ) if self.scale is None: kv = apply_rotary_emb_kv_( kv, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:], self.interleaved, ) else: kv = apply_rotary_emb_kv_( kv, self._cos_k_cached[seqlen_offset:], self._sin_k_cached[seqlen_offset:], self.interleaved, ) return q, kv try: from flash_attn.flash_attn_interface import flash_attn_qkvpacked_func flash_attn_v2_installed = True print(">>>>> using flash attention v2") class FlashAttentionV2(nn.Module): """Implement the scaled dot product attention with softmax. Arguments --------- softmax_scale: The temperature to use for the softmax attention. (default: 1/sqrt(d_keys) where d_keys is computed at runtime) attention_dropout: The dropout rate to apply to the attention (default: 0.0) """ def __init__(self, softmax_scale=None, attention_dropout=0.0): super().__init__() self.softmax_scale = softmax_scale self.dropout_p = attention_dropout def forward( self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None, max_s=None, need_weights=False, ): """Implements the multihead softmax attention. Arguments --------- qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None if unpadded: (nnz, 3, h, d) key_padding_mask: a bool tensor of shape (B, S) """ assert not need_weights assert qkv.dtype in [torch.float16, torch.bfloat16] assert qkv.is_cuda assert key_padding_mask is None assert cu_seqlens is None assert max_s is None output = flash_attn_qkvpacked_func( qkv, self.dropout_p if self.training else 0.0, softmax_scale=self.softmax_scale, causal=causal, ) return output, None except ImportError: flash_attn_v2_installed = False try: import xformers.ops as xops xops_installed = True print(">>>>> Xformers installed") except: xops_installed = False logger = logging.get_logger(__name__) def _make_causal_mask( input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0 ): """ Make causal mask used for bi-directional self-attention. """ bsz, tgt_len = input_ids_shape mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min)) mask_cond = torch.arange(mask.size(-1)) mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) mask = mask.to(dtype) if past_key_values_length > 0: mask = torch.cat( [torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1 ) return mask[None, None, :, :].expand( bsz, 1, tgt_len, tgt_len + past_key_values_length ) def _make_causal_mask_device( input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0, ): """ Make causal mask used for bi-directional self-attention. """ bsz, tgt_len = input_ids_shape mask = torch.full((tgt_len, tgt_len), torch.tensor(float("-inf")), device=device) mask_cond = torch.arange(mask.size(-1), device=device) mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) mask = mask.to(dtype) if past_key_values_length > 0: mask = torch.cat( [ torch.zeros( tgt_len, past_key_values_length, dtype=dtype, device=device ), mask, ], dim=-1, ) return mask[None, None, :, :].expand( bsz, 1, tgt_len, tgt_len + past_key_values_length ) def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) inverted_mask = 1.0 - expanded_mask return inverted_mask.masked_fill( inverted_mask.to(torch.bool), torch.finfo(dtype).min ) def _prepare_decoder_attention_mask( attention_mask, input_shape, inputs_embeds, past_key_values_length ): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] combined_attention_mask = None if input_shape[-1] > 1: combined_attention_mask = _make_causal_mask_device( input_shape, inputs_embeds.dtype, inputs_embeds.device, past_key_values_length=past_key_values_length, ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] expanded_attn_mask = _expand_mask( attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) return combined_attention_mask # @torch.jit.script def rmsnorm_func(hidden_states, weight, variance_epsilon): input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) variance = hidden_states.pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon) return weight * hidden_states.to(input_dtype) class RMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ LlamaRMSNorm is equivalent to T5LayerNorm """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.register_buffer( "variance_epsilon", torch.tensor(eps), persistent=False, ) def forward(self, hidden_states): return rmsnorm_func(hidden_states, self.weight, self.variance_epsilon) class LlamaMLP(nn.Module): def __init__( self, hidden_size: int, intermediate_size: int, hidden_act: str, ): super().__init__() self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.act_fn = ACT2FN[hidden_act] def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) class LlamaAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( self, hidden_size: int, num_heads: int, config, ): super().__init__() self.hidden_size = hidden_size self.num_heads = num_heads self.head_dim = hidden_size // num_heads max_positions = config.max_position_embeddings self.max_positions = max_positions self.config = config if (self.head_dim * num_heads) != self.hidden_size: raise ValueError( f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" f" and `num_heads`: {num_heads})." ) self.q_proj = nn.Linear( hidden_size, num_heads * self.head_dim, bias=False, ) self.k_proj = nn.Linear( hidden_size, num_heads * self.head_dim, bias=False, ) self.v_proj = nn.Linear( hidden_size, num_heads * self.head_dim, bias=False, ) self.o_proj = nn.Linear( num_heads * self.head_dim, hidden_size, bias=False, ) self.rotary_ndims = self.head_dim self.register_buffer( "norm_factor", torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to( torch.get_default_dtype() ), persistent=False, ) if self.config.rope_scaling is None: # by default do linear scale if not specified. scaling_factor = max(self.max_positions / 4096, 1.0) print(f"Linearly scaling {scaling_factor}x.") else: scaling_type = self.config.rope_scaling["type"] scaling_factor = self.config.rope_scaling["factor"] assert scaling_type == "linear" self.rotary_emb = RotaryEmbedding( self.rotary_ndims, base=10000, interleaved=False, scaling_factor=scaling_factor, ) if flash_attn_v2_installed: self.flash_attn = FlashAttentionV2( softmax_scale=1.0 / self.norm_factor, attention_dropout=0 ) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return ( tensor.view(bsz, seq_len, self.num_heads, self.head_dim) .transpose(1, 2) .contiguous() ) def forward( self, hidden_states: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states).view( bsz, q_len, self.num_heads, self.head_dim ) key_states = self.k_proj(hidden_states).view( bsz, q_len, self.num_heads, self.head_dim ) value_states = self.v_proj(hidden_states).view( bsz, q_len, self.num_heads, self.head_dim ) qkv = torch.stack([query_states, key_states, value_states], dim=2) qkv = self.rotary_emb(qkv) if flash_attn_v2_installed: attn_output, _ = self.flash_attn(qkv, causal=True) elif xops_installed: q, k, v = qkv.unbind(2) attn_output = xops.memory_efficient_attention( q, k, v, attn_bias=xops.LowerTriangularMask() ) elif flash_attn_installed: attn_output, _ = self.flash_attn(qkv, causal=True) else: raise Exception("Flash Attention not found.") attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim) attn_output = self.o_proj(attn_output) return attn_output, None, None class LlamaDecoderLayer(nn.Module): def __init__(self, config: LlamaConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, config=config, ) self.mlp = LlamaMLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps ) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, past_key_value: Optional[Tuple[torch.Tensor]] = None, ) -> Tuple[ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] ]: """ Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`, *optional*): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states """ residual = hidden_states hidden_states = self.input_layernorm(hidden_states) # Self Attention hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, past_key_value=past_key_value, attention_mask=attention_mask, output_attentions=output_attentions, ) hidden_states = residual + hidden_states # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) if use_cache: outputs += (present_key_value,) return outputs class GPTEmbeddings(nn.Module): def __init__(self, config, device="cpu"): super().__init__() self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = nn.Embedding( config.vocab_size, config.hidden_size, self.padding_idx ) def forward( self, input_ids, *args, **kargs, ): inputs_embeds = self.embed_tokens(input_ids) return inputs_embeds class GPTLMHead(nn.Module): def __init__(self, config, device="cpu"): super().__init__() self.config = config self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) def forward( self, hidden_states, *args, **kargs, ): hidden_states = self.norm(hidden_states) logits = self.lm_head(hidden_states) return logits class GPTBlock(nn.Module): def __init__(self, config: LlamaConfig, *args, **kargs): super().__init__() self.hidden_size = config.hidden_size self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, config=config, ) self.mlp = LlamaMLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps ) def attn_res(hidden_states: torch.Tensor, attention_mask=None) -> torch.Tensor: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) # Self Attention hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, past_key_value=None, attention_mask=attention_mask, ) hidden_states = residual + hidden_states return hidden_states self.attn_res = attn_res def mlp_res(hidden_states: torch.Tensor) -> torch.Tensor: # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states return hidden_states self.mlp_res = mlp_res self.use_checkpoint = True def forward( self, x: torch.Tensor, layer_past=None, mask=None, *args, **kargs ) -> torch.Tensor: if layer_past is not None: past_length = layer_past[0].size(2) else: past_length = 0 if mask is None: mask = torch.ones( (x.size(0), x.size(1) + past_length), dtype=torch.bool, device=x.device ) attention_mask = None if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.attn_res, x, attention_mask) else: x = self.attn_res(x, attention_mask) if self.use_checkpoint: x.requires_grad_(True) x = checkpoint(self.mlp_res, x) else: x = self.mlp_res(x) return x ================================================ FILE: training/modules/task_modules.py ================================================ import torch class GlueClassification(torch.nn.Module): def __init__(self, model_dim, num_classes): super(GlueClassification, self).__init__() self.model_dim = model_dim self.num_classes = num_classes self.pooler_layer = torch.nn.Linear(model_dim, model_dim) self.fc_layer = torch.nn.Linear(model_dim, num_classes) def forward(self, hidden_states, pooler_index=0): pooled = hidden_states[:, pooler_index, :] pooled = self.pooler_layer(pooled) pooled = torch.tanh(pooled) return self.fc_layer(pooled) ================================================ FILE: training/modules/tokenizer.py ================================================ from transformers import AutoTokenizer, GPT2TokenizerFast, DebertaV2Tokenizer def build_tokenizer(args): tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token return tokenizer def build_gpt2_tokenizer(args): tokenizer = GPT2TokenizerFast.from_pretrained(args.tokenizer_name) tokenizer.pad_token = tokenizer.eos_token return tokenizer def build_deberta_tokenizer(args): tokenizer = DebertaV2Tokenizer.from_pretrained(args.tokenizer_name) return tokenizer ================================================ FILE: training/modules/utils.py ================================================ import torch import math import numpy as np from torch import nn from torch.nn import functional from typing import Optional, Tuple, Union # @torch.jit.script def gpt_loss_func(input, target): lm_logits, labels = input, target shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() loss = functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss ================================================ FILE: training/optimizer/__init__.py ================================================ ================================================ FILE: training/optimizer/grad_scalar.py ================================================ from abc import ABC from abc import abstractmethod import torch class GradScaler(ABC): def __init__(self, initial_scale, device=None): """Initialize scale value with the input initial scale.""" assert initial_scale > 0.0 self.device = device self._scale = torch.cuda.FloatTensor([initial_scale], device=device) @property def scale(self): return self._scale @property def inv_scale(self): return self._scale.double().reciprocal().float() @abstractmethod def update(self, found_inf): pass @abstractmethod def state_dict(self): pass @abstractmethod def load_state_dict(self, state_dict): pass class ConstantGradScaler(GradScaler): def update(self, found_inf): pass def state_dict(self): return dict() def load_state_dict(self, state_dict): pass class DynamicGradScaler(GradScaler): def __init__(self, initial_scale, min_scale, growth_factor, backoff_factor, growth_interval, hysteresis, device=None): """"Grad scaler with dynamic scale that gets adjusted during training.""" super(DynamicGradScaler, self).__init__(initial_scale, device=device) # Lower bound on the scale. assert min_scale > 0.0 assert min_scale <= initial_scale self.min_scale = torch.cuda.FloatTensor([min_scale], device=device) # Growth and backoff factors for the scale. assert growth_factor > 1.0 self.growth_factor = torch.cuda.FloatTensor([growth_factor], device=device) assert backoff_factor < 1.0 assert backoff_factor > 0.0 self.backoff_factor = torch.cuda.FloatTensor([backoff_factor], device=device) # Interval over which if we don't see any inf/nan, # we will scale the grad scale by the growth factor. assert growth_interval > 0 self.growth_interval = growth_interval # Number of inf/nans we should see before scaling down # the grad scale by the backoff factor. assert hysteresis > 0 self.hysteresis = hysteresis # Trackers. self._growth_tracker = 0 self._hysteresis_tracker = self.hysteresis def update(self, found_inf): # If we have an inf/nan, growth tracker is set to 0 # and hysterisis tracker is reduced by 1. if found_inf: self._growth_tracker = 0 self._hysteresis_tracker -= 1 # Now if we are out of hysteresis count, scale down the loss. if self._hysteresis_tracker <= 0: self._scale = torch.max(self._scale * self.backoff_factor, self.min_scale) print('##### scale backoff to', self._scale) else: # If there is no nan/inf, increment the growth tracker. self._growth_tracker += 1 # If we have had enough consequitive intervals with no nan/inf: if self._growth_tracker == self.growth_interval: # Reset the tracker and hysteresis trackers, self._growth_tracker = 0 self._hysteresis_tracker = self.hysteresis # and scale up the loss scale. self._scale = self._scale * self.growth_factor print('##### scale grow to', self._scale) def state_dict(self): state_dict = {} state_dict['scale'] = self._scale state_dict['growth_tracker'] = self._growth_tracker state_dict['hysteresis_tracker'] = self._hysteresis_tracker return state_dict def load_state_dict(self, state_dict): self._scale = state_dict['scale'].to(self.device) self._growth_tracker = state_dict['growth_tracker'] self._hysteresis_tracker = state_dict['hysteresis_tracker'] ================================================ FILE: training/optimizer/optimizer.py ================================================ import torch from .grad_scalar import * # This follows some implementation from Megatron def _has_overflow_serial(grads): def _has_inf_or_nan(x): try: # if x is half, the .float() incurs an additional deep copy, but it's necessary if # Pytorch's .sum() creates a one-element tensor of the same type as x # (which is true for some recent version of pytorch). cpu_sum = float(x.float().sum()) # More efficient version that can be used if .sum() returns a Python scalar # cpu_sum = float(x.sum()) except RuntimeError as instance: # We want to check if inst is actually an overflow exception. # RuntimeError could come from a different error. # If so, we still want the exception to propagate. if "value cannot be converted" not in instance.args[0]: raise return True else: if cpu_sum in [float('inf'), -float('inf')] or cpu_sum != cpu_sum: return True return False for p in grads: if _has_inf_or_nan(p): return torch.FloatTensor([1.0]) return torch.FloatTensor([0.0]) # `x` is a torch.Tensor def _zero_grad_group(group, set_to_none): """Zero out the gradient for a group of parameters. Note: copied from torch.optim.optimizer.""" for param in group: if param.grad is not None: if set_to_none: param.grad = None else: if param.grad.grad_fn is not None: param.grad.detach_() else: param.grad.requires_grad_(False) param.grad.zero_() ''' def _multi_tensor_copy_this_to_that(this, that): for this_, that_ in zip(this, that): that_.copy_(this_) ''' class Fp16Optimizer: # If offload is set to true, the fp32 copy is stored on CPU. def __init__(self, optimizer, grad_scaler, device, offload=False): self.offload = offload if self.offload: self.cpu_to_gpu_stream = torch.cuda.Stream(device=device, priority=-1) self.gpu_to_cpu_stream = torch.cuda.Stream(device=device, priority=-1) self.optimizer = optimizer self.grad_scaler = grad_scaler if self.grad_scaler: self.found_inf = torch.cuda.FloatTensor([0.0], device=device) if not self.offload else torch.FloatTensor([0.0]) self._dummy_overflow_buf = torch.cuda.IntTensor([0], device=device) if not self.offload else torch.IntTensor([0]) # Note that the model should first be cast to fp16 before passing to the optimizer. self.float16_groups = [] self.fp32_from_float16_groups = [] # For all the groups in the original optimizer: for param_group in self.optimizer.param_groups: float16_params_this_group = [] fp32_from_float16_params_this_group = [] # For all the parameters in this group: for i, param in enumerate(param_group['params']): if param.requires_grad: # float16 params: assert param.type() == 'torch.cuda.HalfTensor' float16_params_this_group.append(param) # Create a copy if self.offload: optimizer_param = param.detach().clone().float().to(device='cpu') assert optimizer_param.device == torch.device('cpu') if optimizer_param.grad is None: optimizer_param.grad = torch.zeros_like(optimizer_param.data) else: optimizer_param = param.detach().clone().float() # Replace the optimizer params with the new fp32 copy. param_group['params'][i] = optimizer_param fp32_from_float16_params_this_group.append(optimizer_param) # Reset existing state dict key to the new optimizer param. if param in self.optimizer.state: self.optimizer.state[optimizer_param] = self.optimizer.state.pop(param) self.float16_groups.append(float16_params_this_group) self.fp32_from_float16_groups.append(fp32_from_float16_params_this_group) # Leverage state_dict() and load_state_dict() to # recast preexisting per-param state tensors self.optimizer.load_state_dict(self.optimizer.state_dict()) def zero_grad(self, set_to_none=True): for group in self.float16_groups: _zero_grad_group(group, set_to_none) if not self.offload: for group in self.fp32_from_float16_groups: _zero_grad_group(group, set_to_none) def get_loss_scale(self): return self.grad_scaler.scale def _copy_model_grads_to_optimizer_grads(self): # This only needs to be done for the float16 group. for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): for model_param, optimizer_param in zip(model_group, optimizer_group): if model_param.grad is not None: if self.offload: with torch.cuda.stream(self.gpu_to_cpu_stream): optimizer_param.grad.copy_(model_param.grad, non_blocking=False) else: optimizer_param.grad = model_param.grad.float() # Safe to deallocate model's grad/optimizer_grad after copying. # (If using contiguous buffers, optimizer_grad's memory should # persist and therefore should not be deallocated.) model_param.grad = None def _unscale_optimizer_grads_and_check_for_nan(self): optimizer_grads = [] # fp32 params fromm float16 ones. for optimizer_group in self.fp32_from_float16_groups: for optimizer_param in optimizer_group: if optimizer_param.grad is not None: optimizer_grads.append(optimizer_param.grad.data) # Reset found inf. self.found_inf.fill_(0.0) # Unscale and set found inf/nan print(optimizer_grads[0].device, self.found_inf.device, self.grad_scaler.inv_scale.device) if self.offload: self.found_inf = _has_overflow_serial(optimizer_grads) else: torch._amp_foreach_non_finite_check_and_unscale_(optimizer_grads, self.found_inf, self.grad_scaler.inv_scale) # Check for nan. found_inf_flag = (self.found_inf.item() > 0) return found_inf_flag def _get_model_and_optimizer_params_data_float16_deprecated(self): model_data = [] optimizer_data = [] for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): for model_param, optimizer_param in zip(model_group, optimizer_group): model_data.append(model_param.data) optimizer_data.append(optimizer_param.data) return model_data, optimizer_data def _copy_optimizer_params_to_model_params(self): # Only needed for the float16 params. # model_data, optimizer_data = self._get_model_and_optimizer_params_data_float16_deprecated() # _multi_tensor_copy_this_to_that(this=optimizer_data, that=model_data) for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): for model_param, optimizer_param in zip(model_group, optimizer_group): if self.offload: with torch.cuda.stream(self.cpu_to_gpu_stream): model_param.data.copy_(optimizer_param.data, non_blocking=False) else: model_param.data.copy_(optimizer_param.data) def _copy_model_params_to_optimizer_params(self): # Only needed for the float16 params. # model_data, optimizer_data = self._get_model_and_optimizer_params_data_float16_deprecated() # _multi_tensor_copy_this_to_that(this=model_data, that=optimizer_data) for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): for model_param, optimizer_param in zip(model_group, optimizer_group): if self.offload: with torch.cuda.stream(self.gpu_to_cpu_stream): optimizer_param.data.copy_(model_param.data, non_blocking=False) else: optimizer_param.data.copy_(model_param.data) def reload_model_params(self): self._copy_model_params_to_optimizer_params() @torch.no_grad() def step(self): self._copy_model_grads_to_optimizer_grads() found_inf_flag = self._unscale_optimizer_grads_and_check_for_nan() self.grad_scaler.update(found_inf_flag) # If we found inf/nan, skip the update. if found_inf_flag: print("!!! Warning: find inf in fp16 optimizer-step() !!!") return False for params in self.fp32_from_float16_groups: torch.nn.utils.clip_grad_norm_(params, 1.0) # Step the optimizer. self.optimizer.step() self._copy_optimizer_params_to_model_params() # Successful update. return True def scale(self, z): return z * self.grad_scaler.scale def unscale(self, z): return z * self.grad_scaler.inv_scale def state_dict(self): return self.optimizer.state_dict() def load_state_dict(self, state_dict): self.optimizer.load_state_dict(state_dict) def get_fp16_optimizer(args, optimizer, device): assert args.fp16 is not None if args.loss_scale: print("fp16 uses ConstantGradScaler.") grad_scaler = ConstantGradScaler(args.loss_scale) else: print("fp16 uses DynamicGradScaler.") grad_scaler = DynamicGradScaler( initial_scale=args.initial_loss_scale, min_scale=args.min_loss_scale, growth_factor=2.0, backoff_factor=0.5, growth_interval=args.loss_scale_window, hysteresis=args.hysteresis) return Fp16Optimizer(optimizer, grad_scaler, device, getattr(args, 'use_offload', False)) ================================================ FILE: training/pipeline_parallel/__init__.py ================================================ ================================================ FILE: training/pipeline_parallel/dist_gpipe_pipeline_async.py ================================================ import time import json import torch.nn.functional from torch import optim from comm.comm_utils import * from modules.dist_gpt_pp_module import * from utils.logging_utils import * from data_parallel.dist_dp_utils import get_dp_module from optimizer.optimizer import get_fp16_optimizer import os import cupy from transformers import get_linear_schedule_with_warmup flag_profile = int(os.environ.get('FLAG_BENCHMARK', '0')) def get_parameter_names(model, forbidden_layer_types): """ Returns the names of the model parameters that are not inside a forbidden layer. """ result = [] for name, child in model.named_children(): result += [ f"{name}.{n}" for n in get_parameter_names(child, forbidden_layer_types) if not isinstance(child, tuple(forbidden_layer_types)) ] # Add model specific parameters (defined with nn.Parameter) since they are not in any child. result += list(model._parameters.keys()) return result def create_optimizer(model, optimizer_type, weight_decay=0.01, learning_rate=2e-5, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-6): if optimizer_type == 'adamw' or optimizer_type == 'adam': from torch.optim import AdamW print('>>>>> using Adam') elif optimizer_type == '8bit-adam': from bitsandbytes.optim import Adam8bit as AdamW print('>>>>> using 8bit-Adam') else: assert False decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm]) decay_parameters = [ name for name in decay_parameters if "bias" not in name] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if n in decay_parameters and p.requires_grad], "weight_decay": weight_decay, }, { "params": [p for n, p in model.named_parameters() if n not in decay_parameters and p.requires_grad], "weight_decay": 0.0, } ] optimizer_cls = AdamW optimizer_kwargs = { "betas": (adam_beta1, adam_beta2), "eps": adam_epsilon, } optimizer_kwargs["lr"] = learning_rate optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) return optimizer class GpipeAsync: r""" Async implementation of Gpipe. The current implementation leave the computation on the PyTorch default stream and the communication on a different stream, there is: a group of events to check if recv (from rank i-1) finishes in the forward propagation; a group of events to check if recv (from rank i+1) finishes in the backward propagation; a group of events to check if computation finishes in the forward propagation; a group of events to check if computation finishes in the backward propagation. """ def __init__(self, args, config, device, use_dp=False, _StageFull=GPTStageFull, _StageFirst=GPTStageFirst, _StageLast=GPTStageLast, _StageMiddle=GPTStageMiddle): print("=======Initialize Gpipe.") if args.fp16: self.use_fp16 = True self.use_dynamic_scale = (args.loss_scale == 0) print("=======Gpipe use FP16") else: self.use_fp16 = False print("=======Gpipe use FP32") self.use_dp = use_dp self.dtype = torch.float16 if self.use_fp16 else torch.float32 self.global_rank = args.rank self.pipeline_group_size = args.pipeline_group_size # Rank is the pipeline rank by default. self.pp_rank = get_pipeline_parallel_rank() if use_dp: self.dp_rank = get_data_parallel_rank() else: self.dp_rank = 0 self.pre_node_rank = self.pp_rank - 1 self.post_node_rank = self.pp_rank + \ 1 if self.pp_rank != self.pipeline_group_size - 1 else -1 self.comm = get_pipeline_parallel_comm() self.gradient_accumulate_step = args.gradient_accumulate_step print("=======Gradient accumulate step: ", self.gradient_accumulate_step) assert (args.batch_size % args.micro_batch_size == 0) self.micro_batch_num = args.batch_size // args.micro_batch_size self.micro_batch_size = args.micro_batch_size self.seq_length = args.seq_length self.embedding_dim = args.embedding_dim self.config = config self.vocab_size = config.vocab_size self.num_classes = config.num_labels self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') self.device = device self.torch_comp_stream = torch.cuda.default_stream(device=device) self.torch_recv_stream = torch.cuda.Stream(device=device, priority=-1) self.torch_send_stream = torch.cuda.Stream(device=device, priority=-1) self.forward_recv_ready_events = [torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) for _ in range(self.micro_batch_num)] self.forward_comp_ready_events = [torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) for _ in range(self.micro_batch_num)] self.backward_recv_ready_events = [torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) for _ in range(self.micro_batch_num)] self.backward_comp_ready_events = [torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) for _ in range(self.micro_batch_num)] if self.enable_tidy_profiling: self.profiling_log = [] self.forward_recv_start_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.forward_comp_start_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.forward_send_start_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.forward_send_end_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.backward_recv_start_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.backward_comp_start_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.backward_send_start_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.backward_send_end_events = [torch.cuda.Event(enable_timing=True, blocking=False) for _ in range(self.micro_batch_num)] self.init_event = torch.cuda.Event( enable_timing=True, blocking=False) self.init_time_stamp = None self.optimizer_start_event = torch.cuda.Event( enable_timing=True, blocking=False) self.optimizer_end_event = torch.cuda.Event( enable_timing=True, blocking=False) self._compute_micro_batch_size() if hasattr(args, 'infer_only') and args.infer_only: do_train = False else: do_train = True if self.pp_rank == 0: self.input_micro_batches = None else: self.input_micro_batches = [ torch.zeros((self.micro_batch_size, self.seq_length, self.embedding_dim), requires_grad=do_train, device=self.device, dtype=self.dtype ) for _ in range(self.micro_batch_num) ] if do_train: if self.pp_rank == self.pipeline_group_size - 1: init_train_logger(args) if self.pp_rank == self.pipeline_group_size - 1: self.output_micro_batches_grad = None else: self.output_micro_batches_grad = [ torch.zeros((self.micro_batch_size, self.seq_length, self.embedding_dim), requires_grad=False, device=self.device, dtype=self.dtype ) for _ in range(self.micro_batch_num) ] if self.pipeline_group_size > 1: if self.pp_rank == 0: self.model = _StageFirst(args, config, device) elif self.pp_rank == self.pipeline_group_size - 1: self.model = _StageLast(args, config, device) else: self.model = _StageMiddle(args, config, device) else: self.model = _StageFull(args, config, device) if self.use_fp16: self.model.half() if do_train: if self.use_fp16: tmp_optimizer = create_optimizer( self.model, optimizer_type=getattr(args, 'optimizer', 'adamw'), learning_rate=args.lr) self.optimizer = get_fp16_optimizer( args, tmp_optimizer, device) self.scheduler = get_linear_schedule_with_warmup( tmp_optimizer, args.warmup_steps, args.total_steps, ) else: self.optimizer = create_optimizer( self.model, optimizer_type=getattr(args, 'optimizer', 'adamw'), learning_rate=args.lr) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, args.warmup_steps, args.total_steps, ) # Notice that if we use fp16, gradients are aggregated in fp16, this may not be the default in Megatron. if use_dp: self.dp_optim = get_dp_module( args, device, self.model, self.optimizer) self.global_step = 0 def _compute_micro_batch_size(self): micro_batch_float_num = self.micro_batch_size * \ self.seq_length * self.embedding_dim if self.use_fp16: print("=======Current micro-batch send/recv size: {} MB (fp16)" .format(micro_batch_float_num * 2 // 1024 // 1024)) else: print("=======Current micro-batch send/recv size: {} MB (fp32)" .format(micro_batch_float_num*4//1024//1024)) print("=======Number of micro-batches: {}.".format(self.micro_batch_num)) def zero_input_grad(self): if self.input_micro_batches: for input_micro_batch in self.input_micro_batches: if input_micro_batch.grad is not None: input_micro_batch.grad.zero_() def profile_mark_forward_comp_start(self, i): if self.enable_tidy_profiling: self.torch_comp_stream.record_event( self.forward_comp_start_events[i]) def profile_mark_forward_recv_start(self, i): if self.enable_tidy_profiling: self.torch_recv_stream.record_event( self.forward_recv_start_events[i]) def profile_mark_forward_send_start(self, i): if self.enable_tidy_profiling: self.torch_send_stream.record_event( self.forward_send_start_events[i]) def profile_mark_forward_send_end(self, i): if self.enable_tidy_profiling: self.torch_send_stream.record_event( self.forward_send_end_events[i]) def profile_mark_backward_comp_start(self, i): if self.enable_tidy_profiling: self.torch_comp_stream.record_event( self.backward_comp_start_events[i]) def profile_mark_backward_recv_start(self, i): if self.enable_tidy_profiling: self.torch_recv_stream.record_event( self.backward_recv_start_events[i]) def profile_mark_backward_send_start(self, i): if self.enable_tidy_profiling: self.torch_send_stream.record_event( self.backward_send_start_events[i]) def profile_mark_backward_send_end(self, i): if self.enable_tidy_profiling: self.torch_send_stream.record_event( self.backward_send_end_events[i]) def get_ts(self, event): return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 def forward_stage(self, input_data=None, aux_input_data=None): # print("Forward stage start! rank-", self.rank) if aux_input_data is not None: for k in aux_input_data: aux_input_data[k] = torch.chunk( aux_input_data[k], self.micro_batch_num, dim=0) else: aux_input_data = {} if self.pp_rank == 0: assert(input_data is not None) self.input_micro_batches = torch.chunk( input_data, self.micro_batch_num, dim=0) if self.pp_rank == self.pipeline_group_size - 1: if input_data is not None: input_ids_micro_batches = torch.chunk( input_data, self.micro_batch_num, dim=0) else: input_ids_micro_batches = [None]*self.micro_batch_num output_micro_batches = [] for i in range(self.micro_batch_num): if self.pipeline_group_size > 1: if self.pp_rank == 0: # Only send output to next node, do not receive with torch.cuda.stream(self.torch_comp_stream): self.profile_mark_forward_comp_start(i) current_micro_output = self.model( self.input_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()} ) self.torch_comp_stream.record_event( self.forward_comp_ready_events[i]) with torch.cuda.stream(self.torch_send_stream): cupy_send_stream = cupy.cuda.ExternalStream( self.torch_send_stream.cuda_stream) self.torch_send_stream.wait_event( self.forward_comp_ready_events[i]) self.profile_mark_forward_send_start(i) self.comm.send(current_micro_output.data, dst=self.post_node_rank, stream=cupy_send_stream) self.profile_mark_forward_send_end(i) elif self.pp_rank == self.pipeline_group_size - 1: # Only receive input from last node, do not send with torch.cuda.stream(self.torch_recv_stream): cupy_recv_stream = cupy.cuda.ExternalStream( self.torch_recv_stream.cuda_stream) self.profile_mark_forward_recv_start(i) self.comm.recv( self.input_micro_batches[i], src=self.pre_node_rank, stream=cupy_recv_stream) self.torch_recv_stream.record_event( self.forward_recv_ready_events[i]) with torch.cuda.stream(self.torch_comp_stream): self.torch_comp_stream.wait_event( self.forward_recv_ready_events[i]) self.profile_mark_forward_comp_start(i) current_micro_output = self.model( self.input_micro_batches[i], input_ids=input_ids_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()} ) self.torch_comp_stream.record_event( self.forward_comp_ready_events[i]) else: # receive, compute, and send with torch.cuda.stream(self.torch_recv_stream): cupy_recv_stream = cupy.cuda.ExternalStream( self.torch_recv_stream.cuda_stream) self.profile_mark_forward_recv_start(i) self.comm.recv( self.input_micro_batches[i], src=self.pre_node_rank, stream=cupy_recv_stream) self.torch_recv_stream.record_event( self.forward_recv_ready_events[i]) with torch.cuda.stream(self.torch_comp_stream): self.torch_comp_stream.wait_event( self.forward_recv_ready_events[i]) self.profile_mark_forward_comp_start(i) current_micro_output = self.model( self.input_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()} ) self.torch_comp_stream.record_event( self.forward_comp_ready_events[i]) with torch.cuda.stream(self.torch_send_stream): cupy_send_stream = cupy.cuda.ExternalStream( self.torch_send_stream.cuda_stream) self.torch_send_stream.wait_event( self.forward_comp_ready_events[i]) self.profile_mark_forward_send_start(i) self.comm.send(current_micro_output.data, dst=self.post_node_rank, stream=cupy_send_stream) self.profile_mark_forward_send_end(i) else: with torch.cuda.stream(self.torch_comp_stream): self.profile_mark_forward_comp_start(i) current_micro_output = self.model( self.input_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()} ) self.torch_comp_stream.record_event( self.forward_comp_ready_events[i]) output_micro_batches.append(current_micro_output) return output_micro_batches def profiling_forward_stage(self): torch.cuda.synchronize() for i in range(self.micro_batch_num): if self.pp_rank != 0: recv_slot = self.forward_recv_start_events[i].elapsed_time( self.forward_recv_ready_events[i]) * 1e+3 recv_log = {"name": "recv", "ph": "X", "pid": self.global_rank, "tid": "1. forward-recv", "ts": self.get_ts(self.forward_recv_start_events[i]), "dur": recv_slot, "args": {"micro-batch": i}, "cname": "startup"} # cname is for color, a little silly. # print(recv_log) self.profiling_log.append(recv_log) comp_slot = self.forward_comp_start_events[i].elapsed_time( self.forward_comp_ready_events[i]) * 1e+3 comp_log = {"name": "comp", "ph": "X", "pid": self.global_rank, "tid": "2. forward-compute", "ts": self.get_ts(self.forward_comp_start_events[i]), "dur": comp_slot, "args": {"micro-batch": i}, "cname": "good"} # print(comp_log) self.profiling_log.append(comp_log) if self.pp_rank != self.pipeline_group_size - 1: send_slot = self.forward_send_start_events[i].elapsed_time( self.forward_send_end_events[i]) * 1e+3 send_log = {"name": "send", "ph": "X", "pid": self.global_rank, "tid": "3. forward-send", "ts": self.get_ts(self.forward_send_start_events[i]), "dur": send_slot, "args": {"micro-batch": i}, "cname": "thread_state_iowait"} # print(send_log) self.profiling_log.append(send_log) def backward_stage(self, cached_output_micro_batches: List[torch.Tensor], target=None, loss_func=torch.nn.functional.cross_entropy): # print("Backward stage start! rank-", self.rank) if self.pp_rank == self.pipeline_group_size - 1: assert(target is not None) target_as_micro_batches = torch.chunk( target, self.micro_batch_num, dim=0) # else: # assert(target is None) if self.pp_rank == self.pipeline_group_size - 1: tr_loss = [] for i in range(self.micro_batch_num): if self.pipeline_group_size > 1: if self.pp_rank == self.pipeline_group_size - 1: # only send grad back to last node, do not receive with torch.cuda.stream(self.torch_comp_stream) as st: self.profile_mark_backward_comp_start(i) loss = loss_func( input=cached_output_micro_batches[i], target=target_as_micro_batches[i]) if not flag_profile: tr_loss.append(loss.item()) if self.use_fp16: self.optimizer.scale(loss).backward() else: loss.backward() self.torch_comp_stream.record_event( self.backward_comp_ready_events[i]) with torch.cuda.stream(self.torch_send_stream): cupy_send_stream = cupy.cuda.ExternalStream( self.torch_send_stream.cuda_stream) self.torch_send_stream.wait_event( self.backward_comp_ready_events[i]) self.profile_mark_backward_send_start(i) self.comm.send( self.input_micro_batches[i].grad, dst=self.pre_node_rank, stream=cupy_send_stream) self.profile_mark_backward_send_end(i) elif self.pp_rank == 0: # only receive grad from previous node, do not send with torch.cuda.stream(self.torch_recv_stream): cupy_recv_stream = cupy.cuda.ExternalStream( self.torch_recv_stream.cuda_stream) self.profile_mark_backward_recv_start(i) self.comm.recv( self.output_micro_batches_grad[i], src=self.post_node_rank, stream=cupy_recv_stream) self.torch_recv_stream.record_event( self.backward_recv_ready_events[i]) with torch.cuda.stream(self.torch_comp_stream): self.torch_comp_stream.wait_event( self.backward_recv_ready_events[i]) self.profile_mark_backward_comp_start(i) cached_output_micro_batches[i].backward( gradient=self.output_micro_batches_grad[i]) self.torch_comp_stream.record_event( self.backward_comp_ready_events[i]) else: # receive, compute and send with torch.cuda.stream(self.torch_recv_stream): cupy_recv_stream = cupy.cuda.ExternalStream( self.torch_recv_stream.cuda_stream) self.profile_mark_backward_recv_start(i) self.comm.recv( self.output_micro_batches_grad[i], src=self.post_node_rank, stream=cupy_recv_stream) self.torch_recv_stream.record_event( self.backward_recv_ready_events[i]) with torch.cuda.stream(self.torch_comp_stream): self.torch_comp_stream.wait_event( self.backward_recv_ready_events[i]) self.profile_mark_backward_comp_start(i) cached_output_micro_batches[i].backward( gradient=self.output_micro_batches_grad[i]) self.torch_comp_stream.record_event( self.backward_comp_ready_events[i]) with torch.cuda.stream(self.torch_send_stream): cupy_send_stream = cupy.cuda.ExternalStream( self.torch_send_stream.cuda_stream) self.torch_send_stream.wait_event( self.backward_comp_ready_events[i]) self.profile_mark_backward_send_start(i) self.comm.send( self.input_micro_batches[i].grad, dst=self.pre_node_rank, stream=cupy_send_stream) self.profile_mark_backward_send_end(i) else: with torch.cuda.stream(self.torch_comp_stream) as st: self.profile_mark_backward_comp_start(i) loss = loss_func( input=cached_output_micro_batches[i], target=target_as_micro_batches[i]) if not flag_profile: tr_loss.append(loss.item()) if self.use_fp16: self.optimizer.scale(loss).backward() else: loss.backward() self.torch_comp_stream.record_event( self.backward_comp_ready_events[i]) if not flag_profile: if self.pp_rank == self.pipeline_group_size - 1: train_log( { 'loss': sum(tr_loss)/len(tr_loss), 'lr': self.scheduler.get_last_lr()[0], }, step=self.global_step, ) def profiling_backward_stage(self): torch.cuda.synchronize() for i in range(self.micro_batch_num): if self.pp_rank != self.pipeline_group_size - 1: recv_slot = self.backward_recv_start_events[i].elapsed_time( self.backward_recv_ready_events[i]) * 1e+3 recv_log = {"name": "recv", "ph": "X", "pid": self.global_rank, "tid": "4. backward-recv", "ts": self.get_ts(self.backward_recv_start_events[i]), "dur": recv_slot, "args": {"micro-batch": i}, "cname": "startup"} # print(recv_log) self.profiling_log.append(recv_log) comp_slot = self.backward_comp_start_events[i].elapsed_time( self.backward_comp_ready_events[i]) * 1e+3 comp_log = {"name": "comp", "ph": "X", "pid": self.global_rank, "tid": "5. backward-compute", "ts": self.get_ts(self.backward_comp_start_events[i]), "dur": comp_slot, "args": {"micro-batch": i}, "cname": "good"} # print(comp_log) self.profiling_log.append(comp_log) if self.pp_rank != 0: send_slot = self.backward_send_start_events[i].elapsed_time( self.backward_send_end_events[i]) * 1e+3 send_log = {"name": "send", "ph": "X", "pid": self.global_rank, "tid": "6. backward-send", "ts": self.get_ts(self.backward_send_start_events[i]), "dur": send_slot, "args": {"micro-batch": i}, "cname": "thread_state_iowait"} # print(send_log) self.profiling_log.append(send_log) def save_on_disk(self, path): os.makedirs(path, exist_ok=True) torch.save(self.model.state_dict(), os.path.join(path, 'pytorch_model.bin')) def optimizer_step(self): # hard code: grad clipping if not self.use_fp16: torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) if self.use_dp: with torch.cuda.stream(self.torch_comp_stream): self.torch_comp_stream.record_event( self.dp_optim.backward_ready_event) self.dp_optim.optimizer_step() self.scheduler.step() else: with torch.cuda.stream(self.torch_comp_stream): if self.enable_tidy_profiling: self.optimizer_start_event.record() self.optimizer.step() self.scheduler.step() if self.enable_tidy_profiling: self.optimizer_end_event.record() if self.enable_tidy_profiling: self.profiling_optimizer_step() def profiling_optimizer_step(self): torch.cuda.synchronize() if not self.use_dp: optimizer_slot = self.optimizer_start_event.elapsed_time( self.optimizer_end_event) * 1e+3 optimizer_log = {"name": "opt", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-step", "ts": self.get_ts(self.optimizer_start_event), "dur": optimizer_slot, "cname": "bad"} # print(optimizer_log) self.profiling_log.append(optimizer_log) else: self.profiling_log.extend(self.dp_optim.profiling_data_parallel( self.init_time_stamp, self.init_event)) def export_profiling_result(self, filename): with open(filename, 'w') as outfile: json.dump(self.profiling_log, outfile) def sgd_iter(self, input_=None, target=None, aux_input_data=None, loss_func=torch.nn.functional.cross_entropy): if self.use_fp16 and self.use_dynamic_scale: scales_buffer = [torch.ones_like(self.optimizer.grad_scaler._scale) for _ in range(self.pipeline_group_size)] self.comm.all_gather(self.optimizer.grad_scaler._scale, scales_buffer) self.optimizer.grad_scaler._scale.data[:] = min([s.item() for s in scales_buffer]) self.comm.barrier() start_time = time.time() if self.enable_tidy_profiling: torch.cuda.synchronize() self.init_time_stamp = time.time() * 1e+6 self.init_event.record() step = self.global_step % self.gradient_accumulate_step self.zero_input_grad() if step == 0: self.optimizer.zero_grad(set_to_none=False) if step == self.gradient_accumulate_step - 1 and self.use_dp: if hasattr(self.dp_optim, 'pre_optimizer_step'): self.dp_optim.pre_optimizer_step() outputs = self.forward_stage(input_, aux_input_data=aux_input_data) forward_time = time.time() forward_slot = forward_time-start_time print("Rank {} node forward pass {}/{} takes {:3.2f}s" .format(self.global_rank, step, self.gradient_accumulate_step, forward_slot)) # This is an educated guess that such barrier would make it fair TC (probably required) # self.comm.barrier() self.backward_stage(outputs, target, loss_func=loss_func) backward_time = time.time() print("Rank {} node backward pass {}/{} takes {:3.2f}s" .format(self.global_rank, step, self.gradient_accumulate_step, backward_time-forward_time)) if step == self.gradient_accumulate_step - 1: optimizer_time = time.time() self.optimizer_step() torch.cuda.synchronize() if self.enable_tidy_profiling: self.profiling_forward_stage() self.profiling_backward_stage() print('after cuda sync', self.global_rank) self.comm.barrier() end_time = time.time() print("Rank {} node optimizer step takes {:3.2f}s".format( self.global_rank, end_time - optimizer_time)) else: self.comm.barrier() end_time = time.time() iter_time = end_time - start_time print("Rank {} node whole iteration takes {:3.2f}s".format( self.global_rank, iter_time)) print("-------------------------------------------") # torch.cuda.empty_cache() # print(torch.cuda.memory_summary()) self.global_step += 1 return iter_time def infer_stage(self, input_data=None, aux_input_data=None, labels=None, pred_func=None): if aux_input_data is not None: for k in aux_input_data: aux_input_data[k] = torch.chunk(aux_input_data[k], self.micro_batch_num, dim=0) else: aux_input_data = {} if self.pp_rank == 0: assert(input_data is not None) self.input_micro_batches = torch.chunk(input_data, self.micro_batch_num, dim=0) if self.pp_rank == self.pipeline_group_size - 1: if input_data is not None: input_ids_micro_batches = torch.chunk(input_data, self.micro_batch_num, dim=0) else: input_ids_micro_batches = [None]*self.micro_batch_num if labels is not None: labels = torch.chunk(labels, self.micro_batch_num, dim=0) else: labels = [None]*self.micro_batch_num output_micro_batches = [] for i in range(self.micro_batch_num): if self.pipeline_group_size > 1: if self.pp_rank == 0: # Only send output to next node, do not receive with torch.cuda.stream(self.torch_comp_stream): current_micro_output = self.model( self.input_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()}, ) self.torch_comp_stream.record_event(self.forward_comp_ready_events[i]) with torch.cuda.stream(self.torch_send_stream): cupy_send_stream = cupy.cuda.ExternalStream(self.torch_send_stream.cuda_stream) self.torch_send_stream.wait_event(self.forward_comp_ready_events[i]) self.comm.send(current_micro_output.data, dst=self.post_node_rank, stream=cupy_send_stream) elif self.pp_rank == self.pipeline_group_size - 1: # Only receive input from last node, do not send with torch.cuda.stream(self.torch_recv_stream): cupy_recv_stream = cupy.cuda.ExternalStream(self.torch_recv_stream.cuda_stream) self.comm.recv(self.input_micro_batches[i], src=self.pre_node_rank, stream=cupy_recv_stream) self.torch_recv_stream.record_event(self.forward_recv_ready_events[i]) with torch.cuda.stream(self.torch_comp_stream): self.torch_comp_stream.wait_event(self.forward_recv_ready_events[i]) current_micro_output = self.model( self.input_micro_batches[i], input_ids=input_ids_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()}, ) current_micro_output = pred_func(current_micro_output, labels[i]) self.torch_comp_stream.record_event(self.forward_comp_ready_events[i]) else: # receive, compute, and send with torch.cuda.stream(self.torch_recv_stream): cupy_recv_stream = cupy.cuda.ExternalStream(self.torch_recv_stream.cuda_stream) self.comm.recv(self.input_micro_batches[i], src=self.pre_node_rank, stream=cupy_recv_stream) self.torch_recv_stream.record_event(self.forward_recv_ready_events[i]) with torch.cuda.stream(self.torch_comp_stream): self.torch_comp_stream.wait_event(self.forward_recv_ready_events[i]) current_micro_output = self.model( self.input_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()}, ) self.torch_comp_stream.record_event(self.forward_comp_ready_events[i]) with torch.cuda.stream(self.torch_send_stream): cupy_send_stream = cupy.cuda.ExternalStream(self.torch_send_stream.cuda_stream) self.torch_send_stream.wait_event(self.forward_comp_ready_events[i]) self.comm.send(current_micro_output.data, dst=self.post_node_rank, stream=cupy_send_stream) else: with torch.cuda.stream(self.torch_comp_stream): current_micro_output = self.model( self.input_micro_batches[i], **{k: v[i] for k, v in aux_input_data.items()} ) current_micro_output = pred_func(current_micro_output, labels[i]) self.torch_comp_stream.record_event( self.forward_comp_ready_events[i]) output_micro_batches.append(current_micro_output) return output_micro_batches def infer_iter(self, input_=None, target=None, output_=None, aux_input_data=None, pred_func=None): # self.comm.barrier() torch.cuda.synchronize() with torch.no_grad(): outputs = self.infer_stage(input_, aux_input_data=aux_input_data, labels=target, pred_func=pred_func) if output_ is not None: outputs = torch.cat(outputs, 0).mean().item() print(outputs) output_.append(outputs) torch.cuda.synchronize() # self.comm.barrier() ================================================ FILE: training/pipeline_parallel/dist_pp_utils.py ================================================ from .dist_gpipe_pipeline_async import GpipeAsync def get_pp_module(args, config, device, use_dp): if args.pp_mode == 'gpipe': return GpipeAsync(args, config, device, use_dp) else: print("Not recognize this pipeline parallel mode.") assert False ================================================ FILE: training/tasks/__init__.py ================================================ ================================================ FILE: training/tasks/data_loaders/__init__.py ================================================ ================================================ FILE: training/tasks/data_loaders/data_utils.py ================================================ import os import re import torch import json import numpy as np from torch.utils.data import IterableDataset, DataLoader from itertools import cycle, islice import random from datasets import Dataset from datasets import load_dataset, load_from_disk from comm.comm_utils import * from itertools import islice from random import randint SHOW_DATA = int(os.environ.get('SHOW_DATA', 1)) UL2R_DENOISE_ENABLED = int(os.environ.get('UL2R_DENOISE_ENABLED', 0)) import os import re import torch from torch.utils.data import IterableDataset, DataLoader from itertools import cycle, islice import random from datasets import Dataset from datasets import load_dataset, load_from_disk from comm.comm_utils import * def random_chunk(li, min_chunk=1, max_chunk=5): it = iter(li) while True: nxt = list(islice(it,randint(min_chunk,max_chunk))) if nxt: yield nxt else: break class UL2RProcessor: ''' This is a replication of UL2R from our understanding. We welcome PR if there are better implementations. ''' def __init__(self, tokenizer, seq_length=1024): self.tokenizer = tokenizer self.seq_length = seq_length self.s2s_prefix = self.tokenizer("[S2S]")['input_ids'] self.nlg_prefix = self.tokenizer("[NLG]")['input_ids'] self.nlu_prefix = self.tokenizer("[NLU]")['input_ids'] self.extra_ids = [self.tokenizer.eos_token_id - 100 + i for i in range(80)] def preprocess_tokens_s2s(self, tokens): tokens = self.s2s_prefix + tokens split = int(random.random() * len(tokens)) tokens = tokens[:split] + tokens[split:] tokens = tokens[:self.seq_length] prefix_masks = torch.zeros(len(tokens), dtype=torch.uint8) prefix_masks[:split] = 1 return { 'input_ids': torch.tensor(tokens), 'prefix_masks': prefix_masks, } def preprocess_tokens_nlg(self, tokens): tokens = tokens[:self.seq_length - len(self.nlg_prefix) - 2] start = int(random.random() * len(tokens)) end = start + 1 + int(random.random() * 31) left = self.nlg_prefix + tokens[:start] + [self.extra_ids[0]] + tokens[end:] right = [self.extra_ids[0]] + tokens[start:end] tokens = left + right tokens = tokens[:self.seq_length] tokens = tokens + (self.seq_length - len(tokens)) * [self.tokenizer.eos_token_id] prefix_masks = torch.zeros(len(tokens), dtype=torch.uint8) prefix_masks[:len(left)] = 1 return { 'input_ids': torch.tensor(tokens), 'prefix_masks': prefix_masks, } def preprocess_tokens_nlu(self, tokens): tokens = tokens[:self.seq_length - len(self.nlu_prefix) - 10] # split to chunks chunks = list(random_chunk(tokens, min_chunk=1, max_chunk=5)) # randomly select 15% K = int(0.15 * len(chunks)) indices = random.sample(range(len(chunks)), K) left = self.nlu_prefix right = [] extra_id_count = 0 last_corrupt = False for i, chunk in enumerate(chunks): # make sure not consecutive corrupt chunks if i in indices and not last_corrupt and extra_id_count < len(self.extra_ids): left += [self.extra_ids[extra_id_count]] right += [self.extra_ids[extra_id_count]] + chunk extra_id_count += 1 else: left += chunk last_corrupt = False tokens = left + right tokens = tokens[:self.seq_length] tokens = tokens + (self.seq_length - len(tokens)) * [self.tokenizer.eos_token_id] prefix_masks = torch.zeros(len(tokens), dtype=torch.uint8) prefix_masks[:len(left)] = 1 return { 'input_ids': torch.tensor(tokens), 'prefix_masks': prefix_masks, } def preprocess_ul2r(self, inputs): tokens = inputs['input_ids'].tolist() p = random.random() if p > 0.5: return self.preprocess_tokens_s2s(tokens) elif p > 0.25: return self.preprocess_tokens_nlg(tokens) else: return self.preprocess_tokens_nlu(tokens) def preprocess_random(self, inputs): tokens = inputs['input_ids'].tolist() if random.random() < 0.2: # short prompt split = int(random.random() * 20) else: # random length prompt split = int(random.random() * len(tokens)) tokens = tokens[:split] + tokens[split:] tokens = tokens[:self.seq_length] prefix_masks = torch.zeros(len(tokens), dtype=torch.uint8) prefix_masks[:split] = 1 return { 'input_ids': torch.tensor(tokens), 'prefix_masks': prefix_masks, } def __call__(self, inputs): if UL2R_DENOISE_ENABLED: return self.preprocess_ul2r(inputs) else: return self.preprocess_random(inputs) class StreamDataset(IterableDataset): default_doc_separator = '\n' def __init__(self, data, tokenizer, seq_length=1024, doc_separator=None, cycling=True): self.data = data self.tokenizer = tokenizer self.seq_length = seq_length self.doc_separator = doc_separator or StreamDataset.default_doc_separator self.cycling = cycling self.it = None self.iter_count = 0 self.buffer_tokens = [] def state_dict(self): return {} def load_state_dict(self, state_dict): pass def get_sequence(self): buffer_tokens = self.buffer_tokens for x in self.data: self.iter_count += 1 curr_tokens = self.tokenizer(self.doc_separator + x['text'])['input_ids'] buffer_tokens += curr_tokens while len(buffer_tokens) >= self.seq_length: tokens = buffer_tokens[:self.seq_length] buffer_tokens = buffer_tokens[self.seq_length:] input_ids = torch.tensor(tokens) self.buffer_tokens = buffer_tokens # update for restore yield { 'input_ids': input_ids, } def get_stream(self): if self.cycling: return cycle(self.get_sequence()) else: return self.get_sequence() def __iter__(self): if self.it is None: self.it = self.get_stream() return self.it class StreamDatasetList(IterableDataset): def __init__(self, task_names, datasets, sample_probs, tokenizer, seq_length=1024, print_sample_every_n=64, post_processor=None): self.task_names = task_names self.datasets = datasets self.sample_probs = sample_probs self.tokenizer = tokenizer self.seq_length = seq_length self.print_sample_every_n = print_sample_every_n self.post_processor = post_processor self.token_count = None self.it = None def state_dict(self): return {} def load_state_dict(self, state_dict): pass def get_sequence(self): iterators = [cycle(d.get_sequence()) for d in self.datasets] prob_ths = np.cumsum([p / sum(self.sample_probs) for p in self.sample_probs]) global_i = 0 while True: p = random.random() for task_name, it, th in zip(self.task_names, iterators, prob_ths): if p < th: inputs = next(it) if self.post_processor is not None: inputs = self.post_processor(inputs) if SHOW_DATA: if global_i % self.print_sample_every_n == 0: print(p, th) print(f"**{task_name}**:", self.tokenizer.decode(inputs['input_ids'])) yield inputs global_i += 1 break def get_stream(self): return cycle(self.get_sequence()) def __iter__(self): if self.it is None: self.it = self.get_stream() return self.it def tokenize_function(self, examples): # Update here output = self.tokenizer( examples["text"], padding=False, truncation=True, max_length=self.tokenizer.model_max_length, ) return output # Compute the number of tokens in a dataset using a Torch tokenizer # - return: the sum of tokens from the the text field of each sample in the dataset def get_dataset_token_count(self) -> int: if self.token_count is not None: return self.token_count self.token_count = 0 if self.task_names is None: return self.token_count raw_datasets = load_dataset( "json", data_files=self.task_names, split="train", ) column_names = list(raw_datasets.features) tokenized_datasets = raw_datasets.map( self.tokenize_function, batched=True, remove_columns=column_names, desc="Running tokenizer on dataset", ) for item in tokenized_datasets: self.token_count += len(item['input_ids']) return self.token_count def get_dataset_example_count(self) -> int: num_lines = 0 if self.task_names is None: return num_lines for jsonl_file in self.task_names: with open(jsonl_file, "r") as file: for line in file: if line.replace(" ", "") != "\n": num_lines += 1 return num_lines def name_to_dataset(task, tokenizer, args): if 'prosocial_plus_regular.jsonl' in task: from .prosocial import StreamDataset as _StreamDataset data = load_dataset("json", data_files=task, split="train", streaming=True).shuffle(buffer_size=100_000, seed=args.seed) dataset = _StreamDataset(data, tokenizer, args.seq_length) elif task != '': data = load_dataset("json", data_files=task, split="train", streaming=True).shuffle(buffer_size=100_000, seed=args.seed) dataset = StreamDataset(data, tokenizer, args.seq_length) else: raise Exception('One of the provided datasets is an empty string.') return dataset def name_to_dataset_eval(task, tokenizer, args): if task != '': data = load_dataset("json", data_files=task, split="train", streaming=True) dataset = StreamDataset(data, tokenizer, args.seq_length, cycling=False) return dataset def get_train_data_loader(args, tokenizer, num_workers=1, state_dict=None): task_list = args.task_name.split(',') task_names = [] datasets = [] probs = [] print('data_utils: parse task_list') for task in task_list: if task.startswith('http'): # data from url has an addtional : if len(task.split(':')) == 3: prefix, task, prob = task.strip().split(':') task = f"{prefix}:{task}" prob = float(prob) elif len(task.split(':')) == 2: task = task.strip() prob = 1.0 else: raise Exception('Cannot parse task.') elif ':' in task: task, prob = task.strip().split(':') prob = float(prob) else: task = task.strip() prob = 1.0 dataset = name_to_dataset(task, tokenizer, args) print('data_utils:', task, prob) task_names.append(task) datasets.append(dataset) probs.append(prob) stream_dataset = StreamDatasetList( task_names, datasets, probs, tokenizer=tokenizer, seq_length=args.seq_length) if state_dict is not None: stream_dataset.load_state_dict(state_dict) train_data_loader = torch.utils.data.DataLoader(stream_dataset, batch_size=args.batch_size * args.data_group_size, shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=None) print('data_utils: get train_data_loader') return train_data_loader def get_eval_data_loader(args, tokenizer, num_workers=1, state_dict=None): task_list = args.task_name.split(',') task_names = [] datasets = [] probs = [] print('data_utils: parse task_list') evaluation_data = args.evaluation_data if evaluation_data is None: return None dataset = name_to_dataset_eval(evaluation_data, tokenizer, args) train_data_loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, drop_last=True, num_workers=num_workers, pin_memory=True, collate_fn=None) return train_data_loader def get_ul2r_train_data_loader(args, tokenizer, num_workers=1, state_dict=None): task_list = args.task_name.split(',') task_names = [] datasets = [] probs = [] for task in task_list: if ':' in task: task, prob = task.strip().split(':') prob = float(prob) else: task = task.strip() prob = 1.0 dataset = name_to_dataset(task, tokenizer, args) task_names.append(task) datasets.append(dataset) probs.append(prob) ul2r_processor = UL2RProcessor(tokenizer, seq_length=args.seq_length) stream_dataset = StreamDatasetList( task_names, datasets, probs, tokenizer=tokenizer, seq_length=args.seq_length, post_processor=ul2r_processor) if state_dict is not None: stream_dataset.load_state_dict(state_dict) train_data_loader = torch.utils.data.DataLoader(stream_dataset, batch_size=args.batch_size * args.data_group_size, shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=None) print('ul2r dataloader init done.') return train_data_loader ================================================ FILE: training/tasks/data_loaders/prosocial.py ================================================ import os import re import torch import json from torch.utils.data import IterableDataset, DataLoader from itertools import cycle, islice import random from datasets import Dataset from datasets import load_dataset, load_from_disk from comm.comm_utils import * class StreamDataset(IterableDataset): def __init__(self, dataset, tokenizer, seq_length=1024): self.dataset = dataset self.tokenizer = tokenizer self.seq_length = seq_length self.it = None self.iter_count = 0 def state_dict(self): return { 'iter_count': self.iter_count, } def load_state_dict(self, state_dict): self.iter_count = state_dict['iter_count'] self.dataset = self.dataset.skip(self.iter_count) def get_sequence(self): it = cycle(iter(self.dataset)) while True: text_context = '''Possible labels: 1. casual 2. needs caution 3. needs intervention 4. possibly needs caution 5. probably needs caution''' while True: instance = next(it) text = instance['text'] text_context += '\n\n' + text input_ids = self.tokenizer(text_context.strip())['input_ids'] if len(input_ids) > self.seq_length: break input_ids = input_ids[:self.seq_length] input_ids = torch.tensor(input_ids).long() yield { 'input_ids': input_ids, } def get_stream(self): return cycle(self.get_sequence()) def __iter__(self): if self.it is None: self.it = self.get_stream() return self.it ================================================ FILE: training/utils/__init__.py ================================================ ================================================ FILE: training/utils/dist_args_utils.py ================================================ def add_device_arguments(parser): parser.add_argument('--use-cuda', default=True, type=lambda x: (str(x).lower() == 'true'), help='if this is set to True, will use cuda to train') parser.add_argument('--cuda-id', type=int, default=0, metavar='N', help='cuda index, if the instance has multiple GPUs.') parser.add_argument('--cuda-num', type=int, default=1, metavar='N', help='number of GPUs, if the instance has multiple GPUs.') parser.add_argument('--debug-mem', default=True, type=lambda x: (str(x).lower() == 'true'), help='if this is set to True, we will print some memory stats.') def add_torch_distributed_arguments(parser): parser.add_argument('--dist-backend', type=str, default='cupy_nccl', metavar='S', help='backend type for distributed PyTorch (default: cupy_nccl)') parser.add_argument('--dp-backend', type=str, default='nccl', metavar='S', help='backend type for data parallel') parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:9000', metavar='S', help='master ip for distributed PyTorch') parser.add_argument('--world-size', type=int, default=4, metavar='D', help='world-size (default: 4)') parser.add_argument('--pipeline-group-size', type=int, default=4, metavar='D', help='world-size (default: 2)') parser.add_argument('--data-group-size', type=int, default=1, metavar='D', help='world-size (default: 1)') parser.add_argument('--rank', type=int, default=0, metavar='N', help='rank of the node') def add_task_arguments(parser): parser.add_argument('--train-data', nargs='+', default=['./glue_dataset/data/QQP/train.tsv'], metavar='S', help='path to the training data') parser.add_argument('--valid-data', nargs='+', default=['./glue_dataset/data/QQP/test.tsv'], metavar='S', help='path to the training data') parser.add_argument('--tokenizer-type', type=str, default='BertWordPieceLowerCase', metavar='S', help='which tokenizer to use.') parser.add_argument('--vocab-file', type=str, default='./glue_dataset/data/bert-large-cased-vocab.txt', metavar='S', help='which tokenizer to use.') parser.add_argument('--vocab-extra-ids', type=int, default=0, metavar='N', help='-') parser.add_argument('--make-vocab-size-divisible-by', type=int, default=128, metavar='N', help='-') parser.add_argument('--optimizer', type=str, default='adamw', metavar='N', help='-') def add_model_arguments(parser): parser.add_argument('--seq-length', type=int, default=1024, metavar='N', help='-') parser.add_argument('--embedding-dim', type=int, default=768, metavar='N', help='-') parser.add_argument('--num-layers', type=int, default=4, metavar='N', help='-') parser.add_argument('--num-heads', type=int, default=12, metavar='N', help='-') def add_training_hyper_parameter_arguments(parser): parser.add_argument('--train-log-backend', type=str, default='print', metavar='N', help='-') parser.add_argument('--project-name', type=str, default='test', metavar='N', help='-') parser.add_argument('--batch-size', type=int, default=32, metavar='N', help='input batch size for training (default: 100)') parser.add_argument('--micro-batch-size', type=int, default=8, metavar='N', help='input micro batch size for training (default: 100)') parser.add_argument('--lr', type=float, default=0.01, metavar='N', help='-') parser.add_argument('--num-iters', type=int, default=10, metavar='N', help='-') def add_mixed_precision_arguments(parser): parser.add_argument('--fp16', action='store_true', help='Run model in fp16 mode.') parser.add_argument('--loss-scale', type=float, default=0, help='Static loss scaling, positive power of 2 values can improve fp16 convergence. ') parser.add_argument('--initial-loss-scale', type=float, default=32768, help='Initial loss-scale for dynamic loss scaling.') parser.add_argument('--min-loss-scale', type=float, default=1.0, help='Minimum loss scale for dynamic loss scale.') parser.add_argument('--loss-scale-window', type=float, default=1000, help='Window over which to raise/lower dynamic scale.') parser.add_argument('--hysteresis', type=int, default=2, help='hysteresis for dynamic loss scaling') parser.add_argument('--use-offload', action='store_true', help='Offload optim states to CPU') def add_parallel_schema_arguments(parser): parser.add_argument('--pp-mode', type=str, default='gpipe', metavar='S', help='use which pipeline parallel mode: gpipe or 1f1b.') parser.add_argument('--dp-mode', type=str, default='allreduce', metavar='S', help='use which data parallel mode: allreduce.') parser.add_argument('--gradient-accumulate-step', type=int, default=1, help='Number of gradient computation in Pipeline without data parallel sync.') def get_model_arguments_str(args): return '_l' + str(args.seq_length) + '_m' + str(args.embedding_dim) def get_dist_arguments_str(args, add_rank=True): dist_str = '_w' + str(args.world_size) + '_p' + str(args.pipeline_group_size) + "_" + \ str(args.gradient_accumulate_step) + '_d' + str(args.data_group_size) if add_rank: dist_str = dist_str + '_' + str(args.rank) return dist_str def get_learning_arguments_str(args): return '_b' + str(args.batch_size) + '_' + str(args.micro_batch_size) def get_mixed_precision_arguments_str(args): if args.fp16: return '_fp16' else: return '' ================================================ FILE: training/utils/dist_checkpoint_utils.py ================================================ import os import time import random import json import numpy as np import torch from comm.comm_utils import * def load_checkpoint(pipe, args): if os.path.isfile(os.path.join(args.checkpoint_path, 'latest')): with open(os.path.join(args.checkpoint_path, 'latest')) as f: latest_step = int(f.read()) else: print('no checkpoint available, skipping') return checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") try: with open(os.path.join(checkpoint_step_path, 'meta.json')) as f: meta = json.load(f) except: print('failed to load meta.') pipe.global_step = latest_step try: pipe.model.model.load_state_dict( torch.load( os.path.join( checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_checkpoint.pt' ), map_location=torch.device('cpu') ) ) except: print('failed to load model params.') try: pipe.optimizer.load_state_dict( torch.load( os.path.join( checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_optimizer.pt' ), map_location=torch.device('cpu') ) ) except: print('failed to load optim states.') try: pipe.scheduler.load_state_dict( torch.load( os.path.join( checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_scheduler.pt' ) ) ) except: print('failed to load scheduler states.') def save_checkpoint(pipe, args) -> str: latest_step = pipe.global_step checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") os.makedirs(checkpoint_step_path, exist_ok=True) print(f"Saving checkpoint to {checkpoint_step_path} ...") torch.save( pipe.model.model.state_dict(), os.path.join( checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_checkpoint.pt' ) ) torch.save( pipe.optimizer.state_dict(), os.path.join( checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_optimizer.pt' ) ) torch.save( pipe.scheduler.state_dict(), os.path.join( checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_scheduler.pt' ) ) with open(os.path.join(checkpoint_step_path, 'meta.json'), 'w') as f: json.dump({ 'step': latest_step, }, f) with open(os.path.join(args.checkpoint_path, 'latest'), 'w') as f: f.write(f"{latest_step}") print(f"Checkpoint saved to {checkpoint_step_path} ... Done") return checkpoint_step_path def save_stream_dataloader_state_dict(dataloader, pipe, args): latest_step = pipe.global_step checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") os.system(f"mkdir -p {checkpoint_step_path}") torch.save( dataloader.dataset.state_dict(), os.path.join( checkpoint_step_path, f'dataset_state_dict.pt' ) ) def load_stream_dataloader_state_dict(dataloader, pipe, args): latest_step = pipe.global_step checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") try: state_dict = torch.load( os.path.join( checkpoint_step_path, f'dataset_state_dict.pt' ) ) dataloader.data.load_state_dict(state_dict) except Exception as e: print('failed to load dataset state_dict.') ================================================ FILE: training/utils/dist_debug_utils.py ================================================ import torch def print_cuda_memory(args, info: str, device=None): if args.debug_mem: if device is None: device = torch.device('cuda', args.cuda_id) print("<{}>: current memory allocated: {:2.3f} MB, peak memory: {:2.3f} MB".format( info, torch.cuda.memory_allocated(device)/1048576, torch.cuda.max_memory_allocated(device)/1048576)) def print_multi_cuda_memory(args, info: str): if args.debug_mem: for local_gpu_rank in range(args.cuda_num): device = torch.device('cuda', local_gpu_rank) print("<{}>({}): current memory allocated: {:2.3f} MB, peak memory: {:2.3f} MB".format(info, local_gpu_rank, torch.cuda.memory_allocated(device)/1048576, torch.cuda.max_memory_allocated(device)/1048576)) ================================================ FILE: training/utils/event_report.py ================================================ #!/usr/bin/env python3 # This application reports events that are stored in the event log REST service. # Events will be reported to the event log REST service via POST at: # # http://:/v1/internal/fine-tunes//event # # with Bearer authorization tokens. # # The ouput formate is a JSON object with the following fields: # - "object": # - "created_at": # - "level": # - "message": # - "type": # - "param_count": (optional) # - "token_count": (optional) # - "checkpoint_path": (optional) # - "model_path": (optional) import argparse import json import requests import sys import time class EventReporter: # Event type constants EVENT_TYPE_JOB_START = "JOB_START" EVENT_TYPE_MODEL_DOWNLOAD_COMPLETE = "MODEL_DOWNLOAD_COMPLETE" EVENT_TYPE_TRAINING_DATA_DOWNLOAD_COMPLETE = "TRAINING_DATA_DOWNLOAD_COMPLETE" EVENT_TYPE_TRAINING_START = "TRAINING_START" EVENT_TYPE_CHECKPOINT_SAVE = "CHECKPOINT_SAVE" EVENT_TYPE_EPOCH_COMPLETE = "EPOCH_COMPLETE" EVENT_TYPE_TRAINING_COMPLETE = "TRAINING_COMPLETE" EVENT_TYPE_JOB_COMPLETE = "JOB_COMPLETE" EVENT_TYPE_JOB_ERROR = "JOB_ERROR" supported_event_types = [ EVENT_TYPE_JOB_START, EVENT_TYPE_MODEL_DOWNLOAD_COMPLETE, EVENT_TYPE_TRAINING_DATA_DOWNLOAD_COMPLETE, EVENT_TYPE_TRAINING_START, EVENT_TYPE_CHECKPOINT_SAVE, EVENT_TYPE_EPOCH_COMPLETE, EVENT_TYPE_TRAINING_COMPLETE, EVENT_TYPE_JOB_COMPLETE, EVENT_TYPE_JOB_ERROR, ] # Event level constants LEVEL_INFO = "Info" LEVEL_WARNING = "Warning" LEVEL_ERROR = "Error" supported_event_levels = [ LEVEL_INFO, LEVEL_WARNING, LEVEL_ERROR, ] # Object type constants OBJECT_FINE_TUNE = "fine-tune" supported_object_types = [ OBJECT_FINE_TUNE, ] object_type_to_endpoint = { "fine-tune": "fine-tunes", } def __init__(self, host=None, auth_token=None, job_id=None): self.host = host self.auth_token = auth_token self.job_id = job_id def is_enabled(self) -> bool: # Validate the URL. if self.host is None: return False # Validate the authorization token. if self.auth_token is None: return False # Validate the job ID. if self.job_id is None: return False return True # Report an event to the event log REST service. # The event will be reported to the event log REST service via POST at: # http://:/v1/internal/fine-tunes//event # with Bearer authorization tokens. # The ouput formate is a JSON object with the following fields: # - "object": object type to be reported. Supported object types are given by # `supported_object_types` # - "created_at": The creation timestamp for the event. If not specified, the # current time will be used. # - "level": Event level. Supported event levels are given by `supported_event_levels` # - "message": Event message. # - "type": Event type. Supported event types are given by `supported_event_types` # - "param_count": Report the number of model parameters. (optional) # - "token_count": Report the number of tokens in the training data. (optional) # - "checkpoint_path": The path to a checkpoint file(s) (optional) # - "model_path": The path to model file(s) (optional) # - "requires_is_enabled": When true, verify that is_enabled to return true # and raises an exception if it does not. When false, this function silently # exits without error. (optional) def report(self, object, message, event_type, level=LEVEL_INFO, checkpoint_path=None, model_path=None, param_count=None, token_count=None, requires_is_enabled=True): if requires_is_enabled: # Validate the host. if self.host is None: raise ValueError("Host is required") # Validate the authorization token. if self.auth_token is None: raise ValueError("Authorization token is required") # Validate the job ID. if self.job_id is None: raise ValueError("Job ID is required") elif not self.is_enabled(): return # Get the creation timestamp. created_at = int(time.time()) # Validate the object type. if object is None: raise ValueError("Object type is required") elif object not in self.supported_object_types: raise ValueError(f"Invalid object type : {object}") # Validate the message. if message is None: raise ValueError("Message is required") # Validate the event type. if event_type is None: raise ValueError("Event type is required") elif event_type not in self.supported_event_types: raise ValueError(f"Invalid event type : {event_type}") # Validate the event level. if level is None: level = self.supported_event_levels[0] elif level not in self.supported_event_levels: raise ValueError(f"Invalid event level : {level}") # Create the JSON object. event = { "object": object, "created_at": created_at, "level": level, "message": message, "type": event_type } if checkpoint_path is not None and len(checkpoint_path) > 0: event["checkpoint_path"] = checkpoint_path if model_path is not None and len(model_path) > 0: event["model_path"] = model_path if param_count is not None: event["param_count"] = int(param_count) if token_count is not None: event["token_count"] = int(token_count) event_str = json.dumps(event) # Send the event to the event log REST service. headers = { "Authorization": f"Bearer {self.auth_token}", "Content-Type": "application/json" } endpoint = f"{self.host}/v1/privlieged/{self.object_type_to_endpoint[object]}/{self.job_id}/event" response = requests.post(endpoint, headers=headers, data=event_str) if response.status_code != 200: raise ValueError(f"Failed to send event to event log REST service: ({response.status_code}) response=\"{response.text}\"\nEvent: {event_str}") print(f"Event reported: {event_str}") def add_entry_reporter_arguments(parser): parser.add_argument('--event-host', type=str, required=False, metavar='endpoint:port', help='Event reporting entrypoint URL') parser.add_argument('--event-auth-token', type=str, required=False, help='Bearer authorization token') def main(): parser = argparse.ArgumentParser() parser.add_argument('-u', '--event-host', type=str, required=True, metavar=':', help='Event reporting entrypoint URL (e.g. https://127.0.0.1:8895)') parser.add_argument('-a', '--auth-token', type=str, required=True, help='Bearer authorization token') parser.add_argument('-j', '--job-id', type=str, required=True, help='job id') parser.add_argument('-o', '--object', type=str, required=True, help='object type', metavar="|".join(EventReporter.supported_object_types)) parser.add_argument('-m', '--message', type=str, required=True, help='event message') parser.add_argument('-e', '--event-type', type=str, required=True, help='event type', metavar="|".join(EventReporter.supported_event_types)) parser.add_argument('-c', '--created-at', type=str, required=False, help='timestamp') parser.add_argument('-C', '--checkpoint-path', type=str, required=False, help='S3 checkpoint path') parser.add_argument('-M', '--model-path', type=str, required=False, help='S3 model path') parser.add_argument('-p', '--param-count', type=int, required=False, help='number of parameters') parser.add_argument('-t', '--token-count', type=int, required=False, help='number of tokens') parser.add_argument('-l', '--level', type=str, required=False, help='event level', metavar="|".join(EventReporter.supported_event_levels)) args = parser.parse_args() # Create the event reporter. event_reporter = EventReporter(host=args.event_host, auth_token=args.auth_token, job_id=args.job_id) event_reporter.report(object=args.object, message=args.message, event_type=args.event_type, level=args.level, checkpoint_path=args.checkpoint_path, model_path=args.model_path, param_count=args.param_count, token_count=args.token_count) #usage: event_report.py [-h] -u : -a AUTH_TOKEN -j # JOB_ID -o fine-tune -m MESSAGE -e # JOB_START|MODEL_DOWNLOAD_COMPLETE|TRAINING_DATA_DOWNLOAD_COMPLETE|TRAINING_START|CHECKPOINT_SAVE|EPOCH_COMPLETE|TRAINING_COMPLETE|JOB_COMPLETE|JOB_ERROR # [-c CREATED_AT] [-C CHECKPOINT_PATH] [-M MODEL_PATH] # [-p PARAM_COUNT] [-t TOKEN_COUNT] # [-l Info|Warning|Error] # #optional arguments: # -h, --help show this help message and exit # -u, --event-host : # Event reporting entrypoint URL (e.g. # https://127.0.0.1:8895) # -a, --auth-token AUTH_TOKEN # Bearer authorization token # -j, --job-id JOB_ID # job id # -o, --object fine-tune # object type # -m, --message MESSAGE # event message # -e, --event-type JOB_START|MODEL_DOWNLOAD_COMPLETE|TRAINING_DATA_DOWNLOAD_COMPLETE|TRAINING_START|CHECKPOINT_SAVE|EPOCH_COMPLETE|TRAINING_COMPLETE|JOB_COMPLETE|JOB_ERROR # event type # -c, --created-at CREATED_AT # timestamp # -C, --checkpoint-path CHECKPOINT_PATH # S3 checkpoint path # -M, --model-path MODEL_PATH # S3 model path # -p, --param-count PARAM_COUNT # number of parameters # -t, --token-count TOKEN_COUNT # number of tokens # -l, --level Info|Warning|Error # event level if __name__ == '__main__': try: main() except Exception as e: print(e) sys.exit(1) sys.exit(0) ================================================ FILE: training/utils/logging_utils.py ================================================ import os try: import wandb _has_wandb = True except: _has_wandb = False print("wandb is not installed.") try: import loguru _has_loguru = True except: _has_loguru = False print("loguru is not installed.") train_log_backend = None def init_train_logger(args): global train_log_backend train_log_backend = getattr(args, 'train_log_backend', 'print') if train_log_backend == 'print': pass elif train_log_backend == 'loguru': os.system("mkdir -p logs") loguru.logger.add("logs/file_{time}.log") elif train_log_backend == 'wandb': assert _has_wandb if not hasattr(args, 'project_name'): import re args.project_name = "test-" + \ re.sub('[^a-zA-Z0-9 \n\.]', '_', args.task_name) wandb.init( project=args.project_name, config=args, ) else: raise Exception('Unknown logging backend.') def train_log(x, *args, **kargs): if train_log_backend == 'print': print(x) elif train_log_backend == 'loguru': loguru.logger.info(x) elif train_log_backend == 'wandb': wandb.log(x, *args, **kargs) else: raise Exception('Unknown logging backend.') ================================================ FILE: training/utils/upload_manager.py ================================================ import argparse import boto3 import concurrent.futures import os import re import sys import time from utils.event_report import * class UploadManager: def __init__(self, aws_endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str, aws_session_token: str = None, aws_region: str = "auto", event_reporter: EventReporter = None, n_stages: int = 1, max_wait_sec: int = 600, dry_run: bool = False): self.executor = concurrent.futures.ThreadPoolExecutor() self.futures = [] if aws_endpoint_url is not None and aws_access_key_id is not None and aws_secret_access_key is not None and aws_region is not None: # Create an S3 client self.aws_access_key_id = aws_access_key_id self.aws_secret_access_key = aws_secret_access_key self.aws_session_token = aws_session_token self.aws_region = aws_region self.aws_endpoint_url = aws_endpoint_url self.enabled = True else: self.aws_access_key_id = None self.aws_secret_access_key = None self.aws_session_token = None self.aws_region = None self.aws_endpoint_url = None self.enabled = False self.event_reporter = event_reporter self.dry_run = dry_run if n_stages < 1 and self.enabled: raise ValueError("n_stages must be greater than or equal to 1") self.n_stages = n_stages self.max_wait_sec = max_wait_sec def add_task(self, directory: str, checkpoint_upload_prefix: str, step: int = 0): if self.enabled: # Check that the provided checkpoint upload s3 prefix is valid regex if not re.match(r"s3://[a-zA-Z0-9.\-_]{3,255}/.+", checkpoint_upload_prefix): raise ValueError("checkpoint_upload_prefix must start with s3://") # Get the s3 bucket and key from the checkpoint upload prefix s3_bucket = checkpoint_upload_prefix.split("/")[2] s3_key_prefix = "/".join(checkpoint_upload_prefix.split("/")[3:]) if not s3_key_prefix.endswith("/"): s3_key_prefix += "/" print(f"Uploading checkpoint to bucket=\"{s3_bucket}\", prefix=\"{s3_key_prefix}\"") future = self.executor.submit(self._execute_task, directory, s3_bucket, s3_key_prefix, step) self.futures.append(future) def wait(self): if self.enabled: concurrent.futures.wait(self.futures) def _report_event(self, **kwargs): if self.event_reporter is not None: self.event_reporter.report(object=EventReporter.OBJECT_FINE_TUNE, **kwargs) def _wait_for_file_write_to_finish(self, file_path: str, wait_start_time: float) -> bool: try: file_size = os.stat(file_path).st_size while True: time.sleep(2) file_size_after = os.stat(file_path).st_size if file_size == file_size_after: return True if time.time() - wait_start_time > self.max_wait_sec: return False file_size = file_size_after except Exception as e: print(f"Exception while waiting for file write to finish: {e}") return False def _execute_task(self, directory, s3_bucket, s3_key_prefix, step: int): try: # Create an S3 client session = boto3.Session( aws_access_key_id=self.aws_access_key_id, aws_secret_access_key=self.aws_secret_access_key, aws_session_token=self.aws_session_token, region_name=self.aws_region ) s3_client = session.client('s3', endpoint_url=self.aws_endpoint_url) print(f"Step {step} - Wait for all checkpoint stages to finish ...") wait_start_time = time.time() finished_files = set() # Wait for all stages to finish # Each stage is written by a separate process. We don't know which process # will finish first. So we wait for all stages to finish before proceeding. while True: # Get the list of files in the directory files = os.listdir(directory) print(f"Step {step} - Found {len(files)} of expected {3 * self.n_stages + 1} files in directory: {directory}") # Check if all stages have finished all_finished = False if len(files) == 3 * self.n_stages + 1: all_finished = True # Check if all files are closed for file in files: print(f"Step {step} - Checking if {file} has is finished writing ...") if file not in finished_files: if self._wait_for_file_write_to_finish(os.path.join(directory, file), wait_start_time) == False: all_finished = False break else: print(f"Step {step} - Checking if {file} has is finished writing ... Done") finished_files.add(file) else: all_finished = False if all_finished: break # Check if we have timed out waiting for all stages to finish if time.time() - wait_start_time > self.max_wait_sec: print(f"Step {step} - Timeout waiting for all stages to finish") return time.sleep(10) print(f"Step {step} - Compressing files in directory: {directory}") tar_file_path = f"{directory}.tar.zst" # Get the tar file path tar_file_name = os.path.basename(tar_file_path) # Compress the directory via cli if not self.dry_run: if os.system(f"tar -cf - -C \"{directory}\" . | zstd -3 -T4 > \"{tar_file_path}\"") != 0: print(f"Step {step} - Failed to compress {directory}") return s3_key = f"{s3_key_prefix}{tar_file_name}" print(f"Step {step} - Uploading checkpoint to s3://{s3_bucket}/{s3_key}") if not self.dry_run: # Try uploading the tar file to s3. If it fails, try again after # 20 seconds. for i in range(3): try: s3_client.upload_file(tar_file_path, s3_bucket, s3_key) break except Exception as e: print(f"Step {step} - Failed to upload checkpoint to s3: {e}") if i == 2: self._report_event(message=f"Step {step}, failed to upload checkpoint", event_type=EventReporter.EVENT_TYPE_JOB_ERROR, level=EventReporter.LEVEL_ERROR, requires_is_enabled=False) return time.sleep(20) os.remove(tar_file_path) if self.event_reporter is not None: print(f"Step {step} - Reporting event") try: self._report_event(message=f"Uploaded checkpoint, at step {step}", event_type=EventReporter.EVENT_TYPE_CHECKPOINT_SAVE, checkpoint_path=f"s3://{s3_bucket}/{s3_key}", requires_is_enabled=False) except Exception as e: print(f"Step {step} - Failed to report event: {e}") else: print(f"Step {step} - Event reporter is disabled, skipping reporting event") except Exception as e: print(f"Exception: Step {step} - {e}") self._report_event(message=f"Step {step}, failed to upload checkpoint", event_type=EventReporter.EVENT_TYPE_JOB_ERROR, level=EventReporter.LEVEL_ERROR, requires_is_enabled=False) def add_aws_arguments(parser: argparse.ArgumentParser): parser.add_argument('--aws-endpoint-url', help='AWS endpoint URL') parser.add_argument('--aws-access-key-id', help='AWS access key ID') parser.add_argument('--aws-secret-access-key', help='AWS secret access key') parser.add_argument('--aws-session-token', help='AWS session token') parser.add_argument('--aws-region', default='auto', help='AWS region (default: auto)') def aws_process_args(args: argparse.Namespace, required: bool = False): if args.aws_endpoint_url is None: args.aws_endpoint_url = os.environ.get('AWS_ENDPOINT_URL', 'https://s3.amazonaws.com') if args.aws_access_key_id is None: args.aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID') if required and args.aws_access_key_id is None: print("Error: AWS_ACCESS_KEY_ID is not set") sys.exit(1) if args.aws_secret_access_key is None: args.aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY') if required and args.aws_secret_access_key is None: print("Error: AWS_SECRET_ACCESS_KEY is not set") sys.exit(1) if args.aws_session_token is None: args.aws_session_token = os.environ.get('AWS_SESSION_TOKEN') def main(): parser = argparse.ArgumentParser(description='Process S3 file objects with a specific prefix') parser.add_argument('--bucket-name', required=True, help='S3 bucket name') parser.add_argument('--prefix', required=True, help='Prefix for the S3 objects') add_aws_arguments(parser) add_entry_reporter_arguments(parser) parser.add_argument('--job-id', '-j', type=str, required=True, help='job id') parser.add_argument('--n-stages', type=int, default=1, help='Number of stages') parser.add_argument('--dry-run', action='store_true', default=False, help='Perform a dry run (only print file paths)') parser.add_argument('directories', nargs='+', help='Directories to upload') args = parser.parse_args() aws_process_args(args, required=True) event_reporter = None if args.event_host is not None and args.event_auth_token is not None and args.job_id is not None: event_reporter = EventReporter(host=args.event_host, auth_token=args.event_auth_token, job_id=args.job_id) task_manager = UploadManager(aws_endpoint_url = args.aws_endpoint_url, aws_access_key_id = args.aws_access_key_id, aws_secret_access_key = args.aws_secret_access_key, aws_session_token = args.aws_session_token, aws_region = args.aws_region, event_reporter = event_reporter, n_stages = args.n_stages, dry_run = args.dry_run) checkpoint_upload_prefix = f"s3://{args.bucket_name}/{args.prefix}/" step = 0 for directory in args.directories: print(f"Adding task for directory: {directory}") step += 1 task_manager.add_task(directory=directory, checkpoint_upload_prefix=checkpoint_upload_prefix, step=step) time.sleep(20) print("Waiting for tasks to complete...") start_time = time.time() task_manager.wait() end_time = time.time() print(f"Tasks completed in {end_time - start_time} sec") if __name__ == "__main__": main()