Repository: sunzeyeah/RLHF Branch: master Commit: cd1a6d54971e Files: 135 Total size: 1.4 MB Directory structure: gitextract_138ju87y/ ├── README.md ├── docs/ │ └── INSTALL_ON_WINDOWS.md ├── examples/ │ ├── data_prepare.sh │ ├── eval_pretrain.sh │ ├── eval_reward.sh │ ├── pred_pretrain.sh │ ├── pred_reward.sh │ ├── pretrain.sh │ ├── train_dpo.sh │ ├── train_reward.sh │ ├── train_rlhf.sh │ └── train_sft.sh ├── notebook/ │ ├── data_prepare.ipynb │ ├── reward.ipynb │ ├── rlhf.ipynb │ ├── sft.ipynb │ └── test.ipynb ├── requirements.txt └── src/ ├── __init__.py ├── data/ │ ├── __init__.py │ ├── data.py │ ├── data_types.py │ └── pipeline.py ├── data_prepare.py ├── eval_pretrain.py ├── models/ │ ├── __init__.py │ ├── loss.py │ ├── ppo.py │ ├── reward.py │ ├── rlhf_engine.py │ └── trainer.py ├── pretrain.py ├── pretrain_wo_trainer.py ├── resources/ │ ├── config/ │ │ ├── deepspeed/ │ │ │ ├── Experiment.md │ │ │ ├── README.md │ │ │ ├── stage-1.json │ │ │ ├── stage-2.json │ │ │ ├── stage-3-no_trainer.json │ │ │ └── stage-3.json │ │ ├── log.conf │ │ └── ppo_model/ │ │ ├── ppo_config_glm-350M-chinese.yml │ │ ├── ppo_config_glm-small.yml │ │ ├── ppo_config_pangu-2.6B.yml │ │ └── ppo_config_pangu-350M.yml │ ├── dict/ │ │ └── jieba_custom_dict.txt │ ├── eval/ │ │ ├── ceval/ │ │ │ ├── subject_mapping.json │ │ │ └── submission_example.json │ │ └── mmlu/ │ │ └── subject_mapping.json │ └── models/ │ ├── baichuan-13B-base/ │ │ ├── config.json │ │ ├── configuration_baichuan.py │ │ ├── modeling_baichuan.py │ │ ├── tokenization_baichuan.py │ │ └── tokenizer_config.json │ ├── baichuan-13B-chat/ │ │ ├── config.json │ │ ├── configuration_baichuan.py │ │ ├── modeling_baichuan.py │ │ ├── tokenization_baichuan.py │ │ └── tokenizer_config.json │ ├── baichuan-7B/ │ │ ├── config.json │ │ ├── configuration_baichuan.py │ │ ├── modeling_baichuan.py │ │ ├── tokenization_baichuan.py │ │ └── tokenizer_config.json │ ├── chatglm-6B/ │ │ ├── README.md │ │ ├── config.json │ │ ├── configuration_chatglm.py │ 
│ ├── modeling_chatglm.py │ │ ├── quantization.py │ │ ├── tokenization_chatglm.py │ │ └── tokenizer_config.json │ ├── chatglm2-6B/ │ │ ├── config.json │ │ ├── configuration_chatglm.py │ │ ├── modeling_chatglm.py │ │ ├── quantization.py │ │ ├── tokenization_chatglm.py │ │ └── tokenizer_config.json │ ├── chatglm3-6B/ │ │ ├── config.json │ │ ├── configuration_chatglm.py │ │ ├── modeling_chatglm.py │ │ ├── quantization.py │ │ ├── tokenization_chatglm.py │ │ └── tokenizer_config.json │ ├── glm-10B-chinese/ │ │ ├── README.md │ │ ├── added_tokens.json │ │ ├── config.json │ │ ├── configuration_glm.py │ │ ├── modeling_glm.py │ │ ├── tokenization_glm.py │ │ └── tokenizer_config.json │ ├── glm-350M-chinese/ │ │ ├── README.md │ │ ├── added_tokens.json │ │ ├── config.json │ │ ├── configuration_glm.py │ │ ├── modeling_glm.py │ │ ├── tokenization_glm.py │ │ └── tokenizer_config.json │ ├── pangu-13B/ │ │ ├── config.json │ │ ├── configuration_gptpangu.py │ │ ├── modeling_gptpangu.py │ │ ├── tokenization_gptpangu.py │ │ └── tokenizer_config.json │ ├── pangu-2.6B/ │ │ ├── config.json │ │ ├── configuration_gptpangu.py │ │ ├── modeling_gptpangu.py │ │ ├── tokenization_gptpangu.py │ │ └── tokenizer_config.json │ ├── pangu-350M/ │ │ ├── README-Reward.md │ │ ├── README-SFT.md │ │ ├── README.md │ │ ├── config.json │ │ ├── configuration_gptpangu.py │ │ ├── modeling_gptpangu.py │ │ ├── tokenization_gptpangu.py │ │ └── tokenizer_config.json │ └── qwen-7B-chat/ │ ├── config.json │ ├── configuration_qwen.py │ ├── generation_config.json │ ├── modeling_qwen.py │ ├── qwen_generation_utils.py │ ├── tokenization_qwen.py │ └── tokenizer_config.json ├── train_dpo.py ├── train_reward.py ├── train_rlhf.py ├── train_rlhf_trlx.py ├── train_sft.py ├── train_sft_wo_trainer.py └── utils/ ├── __init__.py ├── config.py ├── file_utils.py ├── loading.py ├── logger.py ├── method_configs.py ├── modeling_utils.py └── nlp_utils.py ================================================ FILE CONTENTS 
================================================ ================================================ FILE: README.md ================================================ ## Features 提供3大功能: - LLM模型预训练:支持常见模型的预训练,包括:decoder结构(LLaMA、GPT)、encoder结构(GLM) - LLM模型评测:参考GPT类模型,基于ZeroShot和FewShot实现 - ChatGPT模型训练pipeline:根据[Learning to Summarize from human feedback](https://arxiv.org/abs/2009.01325) ,实现3大流程: SFT、Reward Model和RLHF - 支持RLHF阶段 (1) 联合优化reward和policy (2) 单独优化policy,冻结reward - 支持[DPO](https://arxiv.org/abs/2305.18290)作为Reward+RLHF的替代方案,可显著降低显存占用,同时实现RL的效果 ## Setup ### 1. Install deepspeed ```bash git clone https://github.com/microsoft/DeepSpeed.git cd deepspeed rm -rf build TORCH_CUDA_ARCH_LIST="7.0" DS_BUILD_OPS=1 pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 | tee build.log ``` 如果想创建binary wheel,方便在其他机器上安装,可使用如下命令,会在```dist```目录生成类似可安装文件```deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl``` ```bash git clone https://github.com/microsoft/DeepSpeed.git cd deepspeed rm -rf build TORCH_CUDA_ARCH_LIST="7.0" DS_BUILD_OPS=1 python setup.py build_ext -j8 bdist_wheel 2>&1 | tee build.log ``` **PS**:需要根据下图,调整```TORCH_CUDA_ARCH_LIST="7.0"```为自己对应的NVIDIA GPU架构 ![image info](./images/torch_cuda_list.png "torch_cuda_list") 或运行```torch.cuda.get_device_capability()```获取自己GPU的架构 ### 2. 
Install jieba 在使用Pangu类模型的时候,其special_token格式为``````、``````等,而[tokenization_gptpangu.py](src/resources/models/pangu-350M/tokenization_gptpangu.py)中```tokenize()```函数会使用```jieba```进行分词。但直接```pip install jieba```,默认会将```<```和```>```直接切分开,使用```jieba.add_word("")```也没有作用,因为```jieba```直接hardcode了会自动切分的token,其中就包括了```<```和```>```。 因此需要执行: ```bash git clone https://github.com/fxsjy/jieba.git cd jieba ``` 将代码clone到本地,修改```jieba/__init__.py```中```re_han_default```的取值,具体改动如下: - 改动前: ```python re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) ``` - 改动后: ```python re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-<>]+)", re.U) ``` 修改完成后使用```pip install .```进行本地编译安装,替换原有```jieba```。安装完成后,在代码中加入```jieba.add_word("")```(该代码已加入[tokenization_gptpangu.py](src/resources/models/pangu-350M/tokenization_gptpangu.py)),即可解决将``````一类的special token切分为多个id的情况 ### 3. Install apex (Optional) ```bash git clone https://github.com/NVIDIA/apex cd apex pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 2>&1 | tee build.log ``` 如果想创建binary wheel,方便在其他机器上安装,可使用如下命令,会在```dist```目录生成类似可安装文件```apex-0.0.1+7150e20-cp38-cp38-linux_x86_64.whl``` ```bash git clone https://github.com/NVIDIA/apex cd apex python setup.py --cpp_ext --cuda_ext bdist_wheel 2>&1 | tee build.log ``` ## Data & Model Download ### 1. 
预训练模型下载 | 模型 | size | huggingface地址 | 百度网盘地址 | 提取码 | | ----------- | ----------- | ----------- | ----------- | ----------- | | [Pangu-350M](https://openi.pcl.ac.cn/PCL-Platform.Intelligence/PanGu-Alpha) | 659MB | [sunzeyeah/pangu-350M](https://huggingface.co/sunzeyeah/pangu-350M) | [Pangu-350M](https://pan.baidu.com/s/1IzgtW48S2PKyjxPPMe1rAQ) | c5jj | | [Pangu-2.6B](https://openi.pcl.ac.cn/PCL-Platform.Intelligence/PanGu-Alpha) | 9.8GB | [sunzeyeah/pangu-2_6B](https://huggingface.co/sunzeyeah/pangu-2_6B) | [Pangu-2.6B](https://pan.baidu.com/s/1Tzvja4q_LgQOwkWPQ4jShw) | 2rad | | [Pangu-13B](https://openi.pcl.ac.cn/PCL-Platform.Intelligence/PanGu-Alpha) | 23.6GB | [sunzeyeah/pangu-13B](https://huggingface.co/sunzeyeah/pangu-13B) | [Pangu-13B](https://pan.baidu.com/s/11fWAeYYKqI7pH0UiuJ5jEQ) | u3dx | | [GLM-350M-chinese](https://github.com/THUDM/GLM) | 679MB | [sunzeyeah/glm-350M-chinese](https://huggingface.co/sunzeyeah/glm-350M-chinese) | [GLM-350M-chinese](https://pan.baidu.com/s/11Lef-E7Tsz5OGOueCpiqaA) | ii8e | | [GLM-10B-chinese](https://github.com/THUDM/GLM) | 18.4G | [sunzeyeah/glm-10B-chinese](https://huggingface.co/sunzeyeah/glm-10B-chinese) | [GLM-10B-chinese](https://pan.baidu.com/s/1GuOefx42n_GzFfwnjoBltw) | fynj | | [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) | 25.6G | [sunzeyeah/chatglm-6B](https://huggingface.co/sunzeyeah/chatglm-6B) | [ChatGLM-6B](https://pan.baidu.com/s/1OlpkMeQD6-LEpNFWx5E-mg) | uq1k | **PS**: 本repo提供的预训练模型下载中, - 对于pytorch_model\*.bin - 如果源文件已包括,则不做改动 - 如果源文件不包括,则根据其提供的checkpoint转换为pytorch_model\*.bin - 其余文件可能相对原文件有改动,包括:modeling_\*.py、tokenization_\*.py、configuration_\*.py、config.json和tokenizer.config ### 2. 
数据下载 | 数据集 | size | huggingface地址 | 百度网盘地址 | 提取码 | | ----------- | ----------- | ----------- | ----------- | ----------- | | [CLUE Benchmark](https://github.com/CLUEbenchmark/CLUE) | 500MB | | [CLUE Benchmark](https://pan.baidu.com/s/15F60nRbBd6d6UvyXdwbXQA) | m6gt | | SFT & Reward Data | 5GB | [sunzeyeah/chinese_chatgpt_corpus](https://huggingface.co/datasets/sunzeyeah/chinese_chatgpt_corpus) | [SFT & Reward Data](https://pan.baidu.com/s/1sl8PB-Dlt1xLIYczMODyRg) | ecyc | | [百科](https://github.com/brightmart/nlp_chinese_corpus) | 652MB | | [baike_qa_2019](https://pan.baidu.com/s/1N6I-fvx6FLHwuxJuDLLA8g) | 7jad | | [知道问答](https://github.com/SophonPlus/ChineseNlpCorpus) | 847MB | | [zhidao](https://pan.baidu.com/s/1sjR3vABiMbdV1HkUt6kCKQ) | neds | | [对联](https://github.com/wb14123/couplet-dataset/releases/download/1.0/couplet.tar.gz) | 221MB | | [couplets](https://pan.baidu.com/s/1oo6oaephPXpyU-sDd_37qg) | 54ey | | [古文](https://github.com/NiuTrans/Classical-Modern) | 125MB | | [Classical & Modern](https://pan.baidu.com/s/1ZyGkYOgzT9ZEVnpSpB4kQg) | a4cr | | [古诗词](https://github.com/chinese-poetry/chinese-poetry) | 87MB | | [chinese poetry](https://pan.baidu.com/s/13uvkA96PdKpKB7ZQ2GkXZQ) | 5zzj | | 微博新闻评论 | 522MB | | [weibo summary comments](https://pan.baidu.com/s/1h45O0q6gQl3LbH-NjzuRlw) | w0g1 | **PS**: SFT & Reward Data基于百科、知道问答、对联、古文、古诗词、微博新闻评论数据构造,可直接用于SFT和Reward阶段训练。详见[data_prepare.py](./src/data_prepare.py) ## Usage ### 1. LLM模型预训练 对开源LLM进行增量预训练,基于deepspeed实现。目前支持2类模型架构: - decoder结构:LLaMA、Baichuan、Pangu - encoder结构:GLM、ChatGLM ```bash cd examples bash pretrain.sh ``` ### 2. 
LLM模型评测 对开源中文LLM进行ZeroShot、OneShot或FewShot的评测。详见[eval_pretrain.py](./src/eval_pretrain.py) 和 [data.py](src/data/data.py)。 目前支持的评测任务: - [C-Eval](https://github.com/SJTU-LIT/ceval) - [MMLU](https://github.com/hendrycks/test) - [CLUEBenchmark](https://github.com/CLUEbenchmark/CLUE) :评测方法和prompt模板参考[Pangu-alpha论文](https://arxiv.org/abs/2104.12369) 目前支持的开源模型: - LLaMA及相关衍生模型 - ChatGLM(1和2) - Baichuan - Qwen - Pangu - GLM ```bash cd examples bash eval_pretrain.sh ``` ### 3. SFT 使用开源LLM + SFT&Reward数据进行SFT训练 ```bash cd examples bash train_sft.sh ``` ### 4. Reward Model 使用SFT模型 + SFT&Reward数据进行Reward模型训练 ```bash cd examples bash train_reward.sh ``` ### 5. RLHF 利用PPO算法和Reward Model,进一步更新SFT模型。基于开源框架[DeepSpeedChat](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) 实现 ```bash cd examples bash train_rlhf.sh ``` ### 6. DPO 利用DPO算法替代Reward+RLHF的pipeline,免去训练Reward模型,同时达到RL训练的效果,该方法可显著降低显存占用。基于开源框架[trl](https://github.com/huggingface/trl) 实现 ```bash cd examples bash train_dpo.sh ``` ## Results ### 1. LLM模型评测
C-Eval 5-shot测试集(test)结果
Model Avg Avg(Hard) STEM Social Science Humanities Other
Baichuan2-13B-Chat 56.30 34.20 48.20 70.00 60.50 54.20
xverse-13B 55.30 32.50 45.90 66.70 59.50 57.60
Qwen-7B-Chat 54.70 35.40 47.90 68.30 58.70 50.00
Baichuan-13B-Base 53.70 35.60 46.80 65.80 58.00 50.80
Baichuan2-7B-Chat 52.50 33.80 45.70 64.20 56.60 50.20
ChatGLM2-6B 51.20 33.40 46.90 63.00 51.60 47.70
Baichuan-13B-Chat 47.90 31.50 41.40 56.80 53.00 46.50
Baichuan-7B 44.20 31.70 39.20 53.30 47.30 41.90
Ziya-LLaMA-13B-v1.1 40.10 30.30 35.80 47.30 42.80 38.50
ChatGLM1.1-6B 38.10 28.60 33.60 46.70 40.90 35.70
AtomGPT-13B-56k 37.60 25.30 32.00 44.70 42.80 36.10
LLaMA2-13B-chat 37.10 29.30 34.60 43.60 35.90 37.00
ChatGLM-6B 36.30 27.20 32.90 42.80 38.10 34.90
LLaMA-30B 35.90 29.90 34.40 42.40 33.30 35.60
LLaMA2-7B-chat 33.50 27.30 31.60 38.10 33.80 32.70
Ziya-LLaMA-13B-Pretrain-v1 31.10 22.20 27.40 36.50 33.80 30.40
LLaMA-13B 29.80 24.20 28.40 33.70 29.60 29.00
LLaMA-7B 26.80 26.70 26.20 27.60 25.70 28.10
MMLU 5-shot测试集(test)结果
Model Avg STEM Social Science Humanities Other
Baichuan2-13B-Chat 56.90 47.28 66.23 52.90 63.50
LLaMA-30B 56.33 44.68 65.64 54.60 61.57
xverse-13B 55.24 45.60 64.51 50.32 63.27
Qwen-7B-Chat 54.13 41.76 63.43 50.81 62.50
LLaMA2-13B-chat 53.98 44.52 63.40 49.37 61.21
Baichuan-13B-Base 53.46 43.86 63.14 49.73 59.28
Baichuan2-7B-Chat 53.11 43.51 62.26 49.58 59.12
Baichuan-13B-Chat 51.12 41.61 59.11 47.52 58.31
Ziya-LLaMA-13B-v1.1 51.06 41.89 57.71 49.22 56.54
LLaMA2-7B-chat 48.10 39.64 56.28 43.61 55.39
LLaMA-13B 46.51 37.23 52.71 44.35 53.04
ChatGLM2-6B 45.83 38.75 52.06 43.20 50.82
AtomGPT-13B-56k 42.75 36.02 49.04 38.80 49.30
Baichuan-7B 41.96 36.63 47.77 37.55 48.31
Ziya-LLaMA-13B-Pretrain-v1 41.61 33.61 46.01 39.85 48.05
ChatGLM1.1-6B 40.07 32.95 44.55 39.23 44.12
ChatGLM-6B 37.87 32.41 43.80 35.60 41.00
LLaMA-7B 28.53 26.10 28.76 28.52 24.81
CLUEBenchmark 验证集(dev.json)结果
Dataset Method Metrics Task Type Zero-shot Few-shot
GLM-350M-chinese Pangu-350M Pangu-2.6B GLM-10B-chinese Pangu-13B GLM-350M-chinese Pangu-350M Pangu-2.6B GLM-10B-chinese Pangu-13B
OCNLI PPL acc NLI 0.3074 0.3369 0.3061 0.3288 0.3301 0.3298 0.3352 0.3216
CMNLI PPL acc NLI 0.3279 0.3302 0.3310 0.3338 0.3358 0.3356 0.3328 0.3300
CHID PPL acc Cloze(multi-choices) 0.0734 0.0916 0.0670 0.1016 0.1018 0.0979 0.1007 0.0996
CMRC2018 generation f1 MRC 0.093 0.0979 0.1007 0.1392 0.021 0.09345 0.097 0.1007
CLUEWSC2020 PPL acc WSC 0.4934 0.5328 0.5592 0.5131 0.4671 0.5526 0.4473 0.4671
C3 PPL acc Common sense reasoning 0.2360 0.2426 0.2418 0.2573 0.2567 0.2476 0.2559 0.2515
AFQMC PPL acc Text classification 0.6306 0.4582 0.4914 0.4960 0.5000 0.4872 0.4993 0.5018
CSL PPL acc Text classification 0.4943 0.4913 0.4666 0.5126 0.4996 0.5140 0.5036 0.4973
IFLYTEK PPL acc Text classification 0.1292 0.3058 0.265 0.2620 0.2408 0.2539 0.2535 0.2524
TNEWS PPL acc Text classification 0.1582 0.2022 0.2449 0.2489 0.2527 0.2555 0.2466 0.2494
### 2. SFT 模型训练参数: | 模型 | 可训练参数量 | 数据量 | batch size | sequence length | 硬件 | 显存占用 | speed | Hours per epoch | | --- | --- | --- | --- | :---: | :---: | :---: | --- | --- | | GLM-350M-chinese | 355M | 5.4M | 4 | 512 | V100 16G | 13G | 3.7 s/iter | 88h | | Pangu-350M | 345M | 5.4M | 48 | 512 | A100 80G | 78G | 1.91 s/iter | 27.5h | | Pangu-2.6B | 2.6B | 5.4M | 8 | 512 | A100 80G | 79.4G | 9.61 s/iter | 116h | SFT模型下载: | 预训练模型 | SFT数据集 | size | huggingface地址 | 百度网盘地址 | 提取码 | | ----------- | --- | ----------- | ----------- | ----------- | ----------- | | Pangu-350M | SFT & Reward Data | 1.3GB | [sunzeyeah/pangu-350M-sft](https://huggingface.co/sunzeyeah/pangu-350M-sft) | [Pangu-350M-SFT](https://pan.baidu.com/s/14nF63nAQz38jmplUC0EQ-g) | 0f8j | | Pangu-2.6B | SFT & Reward Data | 9.8GB | [sunzeyeah/pangu-2_6B-sft](https://huggingface.co/sunzeyeah/pangu-2_6B-sft) | [Pangu-2.6B-SFT](https://pan.baidu.com/s/1Q2PKf0MnFsiFlNhU-6WIrg) | r13w |
Pangu-2.6B-SFT生成示例 ``` # 生成参数 max_length = 512 max_length_generation = 100 top_k = 10 temperature = 0.8 do_sample=True ``` ``` [Input] 我不能确定对方是不是喜欢我,我却想分分秒秒跟他在一起,有谁能告诉我如何能想他少一点回答: [Generated] 你爱他就多关心他,少在他面前表现出你的脆弱。这样他才更会爱你。 [Input] 一次次买彩,一次次倾听那比分,一次次兴奋,一次次失落,一次次无归而返!最终还是任8~~~~回答: [Generated] 我也是这样,买彩的时间也不长,但是每次都是这样。。。。。。。 [Input] 现代文:太祖很高兴地说:凭着这样的部下去作战,还担心不能取胜么?古文: [Generated] 太祖悦曰:此自是以去,何忧不胜? [Input] 现代文:十一年秋,丞相伯颜与阿术会师襄阳,遣阿剌罕率诸翼军攻郢、复诸州,十月夺取郢州之南门堡。古文: [Generated] 十一年秋,丞相伯颜与阿术会师襄阳,遣阿剌罕率诸翼军攻郢、复诸州,十月夺郢州之南门堡。 [Input] 以孔⁇的风格,写一篇经书,题为中庸 [Generated] 我闻昔者夫子之言也,曰:吾道一以贯之。其或行之或不行,吾未知之。是故君子慎其独也。吾未见好德如好色者也。好德如好色者,未之有也。好德如好色者,既已之有也。君子不以言举人,不以人废言。好德如好色者,其如命何!吾 [Input] 以陆游的风格,写一首七言诗,题为晨起 [Generated] 秋光满纸清⁇明,睡醒一江万马奔。自笑身如枯木鹤,何妨食饱作闲人。 [Input] 上联:霜风几笔雕残画下联: [Generated] 云海千山叠翠峰 [Input] 新闻内容:据悉,这名14岁的女生在Facebook上发布了戏谑考试的博文,其中使用了胡志明1946年反对法国殖民者的讲话,随后被越南当局停课一年。越南官员称,该学生歪曲了历史事件,侮辱了老师的情感。多数网友虽对越南当局做法不满,但都表示,至少他们还能上Facebook.摘要:越南女生Facebook上模仿胡志明讲话被停课评论: [Generated] 这女的,还真有勇气,我喜欢你! [Input] 新闻内容:习近平天津考察时勉励大学生志存高远、脚踏实地,转变择业观念,坚持从实际出发,勇于到基层一线和艰苦地方去,把人生的路一步步走稳走实,善于在平凡岗位上创造不平凡的业绩摘要:习近平勉励大学生转变择业观念勇于到基层和艰苦地方去评论: [Generated] 现在的社会,很多年轻人都是在父母的安排下,去了不同的单位,在不同的地方工作,在不同的城市生活,这就导致了年轻人的思维方式,价值观都不一样,这也导致了年轻人的成长道路,也有很多的不安全感,这也是很正常的事情,年轻人需要正确的引导,正确的引导,也需要正确的引导! [Input] 问题:劳动仲裁申请法院强制执行,法院强制执行需要多长时间回答: [Generated] 一般申请法院强制执行需要3-6个月时间,具体需要看法院的执行措施。 [Input] 问题:银行贷款50万,30年!年利率是4.41怎么算一个月还多少钱回答: [Generated] 若申请的是招行贷款,要计算贷款利息或者每月还款金额,需要知道贷款本金,贷款期限,还款方式,贷款年利率。若上述信息都确认到,可以通过我行贷款计算器尝试计算,登录招行官网右下方找到“理财计算器”--“个人贷款计算器”可以计算。 ```
### 3. Reward Model 模型训练参数: | 模型 | 可训练参数量 | 数据量 | batch size | sequence length | 硬件 | 显存占用 | speed | Hours per epoch | | --- | --- | --- | --- | :---: | :---: | :---: | --- | --- | | Pangu-350M | 131M | 12M | 32 | 512 | A100 80G | 72.6G | 1.91 s/iter | 105h | | Pangu-2.6B | 815M | 12M | 8 | 512 | A100 80G | 80.7G | | 423h | Reward模型下载: | 预训练模型 | SFT数据集 | size | huggingface地址 | 百度网盘地址 | 提取码 | | ----------- | --- | ----------- | ----------- | ----------- | ----------- | | Pangu-350M | SFT & Reward Data | 1.3GB | [sunzeyeah/pangu-350M-reward](https://huggingface.co/sunzeyeah/pangu-350M-reward) | [Pangu-350M-Reward](https://pan.baidu.com/s/1wC3w78t7pVn0Xn5tJHy06A) | 4gju | ### 4. RLHF To be updated ### 5. DeepSpeed实验 为验证不同预训练模型使用deepspeed的训练效率是否能达到官方宣称的效果(加速、节省GPU等),进行了benchmarking - 实验场景:SFT阶段训练 - 实验参数:```max_sequence_length=512```
DeepSpeed实验结果
模型 数据 整体耗时/epoch 单条样本耗时 内存使用量 显存使用量 GPU型号和数量 fp16 bf16 deepspeed stage offload optimizer pin memory offload param overlap comm allgather bucket size stage3 max live parameters batch size gradient accumulation steps gradient checkpointing model half
T5-large wmt16-en-ro, 共计61万条样本 43h 0.5s/it 7.1G 1*14529MB 1*V100 16G true - - - - - - - - 2 8 false false
152h 1.78s/it 38.26G 1*11663MB 1*V100 16G true - 2 true true - false 2e8 - 2 8 false false
250h 2.95s/it 38.74G 1*7255MB 1*V100 16G true - 2 true true - false 1e5 - 2 8 false false
62h 5.8s/it 86.81G 8*7811MB 8*V100 16G true - 2 true true - false 1e5 - 2 8 false false
- - - OOM 1*V100 16G true - 2 true true - false 2e8 - 16 8 false false
- - - OOM 1*V100 16G true - 2 true true - false 1e5 - 16 8 false false
290h 3.48s/it 46.53G 1*6655MB 1*V100 16G true - 3 true true true false 2e8 2e8 2 8 false false
380h 4.5s/it 43.48G 1*5263MB 1*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
215h 4.9s/it 47.31G 2*5019MB 2*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
1370h 64s/it 57.55G 4*4701MB 4*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
948h 90s/it 72.54G 8*4585MB 8*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
Pangu-2.6B SFT & Reward Data的验证集,共1万条样本 2h 5.76s/it 67.86G 1*15631MB 1*V100 16G true - 2 true true - false 2e8 - 2 8 false false
2.1h 6.15s/it 67.88G 1*15705MB 1*V100 16G true - 2 true true - false 1e5 - 2 8 false false
4.5h 13.3s/it 81.02G 1*15449MB 1*V100 16G true - 3 true true true false 2e8 2e8 2 8 false false
11.5h 8.2s/it 75.89G 1*15299MB 1*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
5.5h 7.8s/it 81.16G 2*14851MB 2*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
6.2h 18.3s/it 97.31G 4*14389MB 4*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
6.6h 38s/it 118.82G 8*14335MB 8*V100 16G true - 3 true true true false 1e5 1e5 2 8 false false
ChatGLM-6B SFT & Reward Data的验证集,共1万条样本 - - 120.45G OOM 1*V100 16G true - 2 true true - false 1e5 - 1 8 true true
- - 120.48G OOM 1*V100 16G true - 2 true true - false 1e3 - 1 8 false true
- - 153.02G OOM 1*V100 16G true - 3 true true true false 1e2 1e2 1 8 false true
- - 154G OOM 1*V100 16G true - 3 true true true false 2e8 2e8 1 8 true true
21.2h 60s/it 154G 1*10443MB 1*V100 16G true - 3 true true true false 2e8 auto 1 8 true true
21.5h 60s/it 152.81G 1*10409MB 1*V100 16G true - 3 true true true false 1e5 1e5 1 8 true true
23.5h 65s/it 153.36G 1*9229MB 1*V100 16G true - 3 true true true false 1e3 1e3 1 8 true true
14h 80s/it 158.21G 2*8631MB 2*V100 16G true - 3 true true true false 1e3 1e3 1 8 true true
7.8h 90s/it 168.38G 4*6743MB 4*V100 16G true - 3 true true true false 1e3 1e3 1 8 true true
4h 90s/it 189.34G 8*6729MB 8*V100 16G true - 3 true true true false 1e3 1e3 1 8 true true
1h 100s/it 189.38G 8*10047MB 8*V100 16G true - 3 true true true false 1e3 1e3 4 8 true true
50min 40s/it 189.39G 8*14763MB 8*V100 16G true - 3 true true true false 1e3 1e3 8 2 true true
35min 113s/it 189.39G 8*14763MB 8*V100 16G true - 3 true true true false 1e3 1e3 8 8 true true
- - 189.34G OOM 8*V100 16G true - 3 true true true false 1e3 1e3 10 8 true true
GLM-10B-Chinese SFT & Reward Data的验证集,共1万条样本 - - - OOM 1*V100 16G true - 3 true true true false 2e8 2e8 1 8 true false
- - - OOM 1*V100 16G true - 3 true true true false 2e8 auto 1 8 true false
- - - OOM 1*V100 16G true - 3 true true true false 1e5 1e5 1 8 true false
- - - OOM 1*V100 16G true - 3 true true true false 1e3 1e3 1 8 true false
- - - OOM 1*V100 16G true - 3 true true true false 1e2 1e2 1 8 true false
- - - OOM 2*V100 16G true - 3 true true true false 1e2 1e2 1 8 true false
- - - OOM 4*V100 16G true - 3 true true true false 1e2 1e2 1 8 true false
- - OOM - 8*V100 16G true - 3 true true true false 1e2 1e2 1 8 true false
- - - OOM 4*V100 16G true - 3 true true true false 1e2 1e2 1 8 true true
- - - OOM 6*V100 16G true - 3 true true true false 1e2 1e2 1 8 true true
- - OOM - 8*V100 16G true - 3 true true true false 1e2 1e2 1 8 true true
**PS**: deepspeed的参数介绍和调优经验,可参见[DeepSpeed Configuration](src/resources/config/deepspeed/README.md) ### 6. LoRA实验 为验证LoRA的训练效率提升,进行了benchmarking - 实验场景:SFT阶段训练 - 实验数据:SFT & Reward Data的验证集,共1万条样本 - 实验参数:```max_sequence_length=512, lora_alpha=1, lora_train_bias='none'```
LoRA实验结果
模型 LoRA rank 可训练参数量 deepspeed batch size GPU型号和数量 显存使用量 单条样本耗时 整体耗时/epoch
Pangu-2.6B - 2.6B - 8 1*A100 80G 1*79421MB 9.66s/it 12.5min
1000 1.5B - 8 1*A100 80G 1*76129MB 11.61s/it 15min
500 758MB - 12 1*A100 80G 1*77179MB 16.2s/it 14min
100 151MB - 16 1*A100 80G 1*81103MB 18.6s/it 12min
50 75MB - 16 1*A100 80G 1*80809MB 17.8s/it 11.5min
10 15MB - 16 1*A100 80G 1*78735MB 17.6s/it 11.5min
100 151MB stage=2, w offloading 24 1*A100 80G 1*76933MB 25.5s/it 11min
100 151MB stage=3, w offloading 24 1*A100 80G 1*77259MB 46.5s/it 20min
ChatGLM-6B - 6.2B - 3 1*A100 80G 1*79206MB 6.7s/it 23.5min
1000 1.9B - 6 1*A100 80G 1*78840MB 12.8s/it 22.5min
500 994MB - 6 1*A100 80G 1*68832MB 12.4s/it 21.5min
================================================ FILE: docs/INSTALL_ON_WINDOWS.md ================================================ 大模型相关的常见package在***Windows***上的安装方案。在以下配置上已测试成功: ```windows11 + python3.11 + cuda12.3 + cudnn8.9.5.30 + torch2.1.0+cu121 + ninja1.11.1 + cmake3.28.0-rc5 + Visual Studio 2022``` ## 0. Prerequisite #### a. 确认NVIDIA GPU架构 可根据下图 ![image info](../images/torch_cuda_list.png "torch_cuda_list") 或运行```torch.cuda.get_device_capability()```获取 #### b. 软件安装 需安装Visual Studio、ninja、cmake和CUDA - Microsoft Visual Studio 2022安装注意事项如下,`单个组件`需勾选6个额外模块 ,[参考链接](https://blog.csdn.net/feking2/article/details/130160845): ![image info](../images/microsoft_visual_studio_components.png "microsoft_visual_studio_components") - 安装顺序:必须先安装Visual Studio,再安装CUDA ## 1. apex 目前为止没有成功 ## 2. cutlass 打开Git Bash ```bash # 设定环境变量 export CUDACXX=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v12.3/bin/nvcc git clone https://github.com/NVIDIA/cutlass.git mkdir build && cd build # 显卡架构的数字根据自己情况修改 cmake .. -DCUTLASS_NVCC_ARCHS=89 -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_ENABLE_LIBRARY=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON cmake --build . --config Release -j 8 ``` 在以下commit-id: `b5d8a5d9ccb435268e2215144dca162b0fc6a826`编译成功 ## 3. flash attention 打开Git Bash ```bash git clone https://github.com/Dao-AILab/flash-attention.git cd flash-attention # IMPORTANT: do not manually copy cmake-built (or incorrect version?) cutlass code to flash-attention/csrc/cutlass MAX_JOBS=8 python setup.py install ``` 在以下commit-id: `017716451d446e464dde9aca3a3c1ed2209caaa9`安装成功 - benchmark - 测试代码: https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py - 测试设备:单张4090
结果
is causal head dim batch size seq length Model Speed (TFLOPS/s)
fwd bwd fwd+bwd
false 64 32 512 Flash2 149.21 91.05 102.46
false 64 32 512 Pytorch 20.04 22.91 22.01
false 64 32 512 xformers.cutlass 126.21 74.03 83.95
false 64 32 512 xformers.flash 161.29 99.23 111.49
false 64 16 1024 Flash2 167.81 124.49 134.40
false 64 16 1024 Pytorch 21.43 25.68 24.30
false 64 16 1024 xformers.cutlass 132.98 78.03 88.48
false 64 16 1024 xformers.flash 164.71 124.29 133.66
false 64 8 2048 Flash2 169.75 123.30 133.76
false 64 8 2048 Pytorch 25.58 27.19 26.71
false 64 8 2048 xformers.cutlass 136.01 77.43 88.29
false 64 8 2048 xformers.flash 168.51 13.641 144.26
false 64 4 4096 Flash2 141.27 142.01 141.79
false 64 4 4096 Pytorch 26.39 0.64 0.89
false 64 4 4096 xformers.cutlass 134.51 67.28 78.49
false 64 4 4096 xformers.flash 5.52 152.40 17.71
true 64 32 512 Flash2 2.13 43.81 6.65
true 64 32 512 Pytorch 7.49 10.79 9.58
true 64 32 512 xformers.cutlass 100.97 56.74 64.85
true 64 32 512 xformers.flash 2.38 7.61 4.68
true 64 16 1024 Flash2 3.36 89.96 10.73
true 64 16 1024 Pytorch 8.35 12.27 10.82
true 64 16 1024 xformers.cutlass 115.96 58.43 68.08
true 64 16 1024 xformers.flash 3.06 90.85 9.87
true 64 8 2048 Flash2 5.24 112.91 16.44
true 64 8 2048 Pytorch 8.76 13.39 11.63
true 64 8 2048 xformers.cutlass 123.37 67.61 77.63
true 64 8 2048 xformers.flash 4.37 115.97 13.99
true 64 4 4096 Flash2 7.26 134.79 22.40
true 64 4 4096 Pytorch 0.22 0.35 0.30
true 64 4 4096 xformers.cutlass 129.03 70.07 80.60
true 64 4 4096 xformers.flash 6.25 103.89 19.02
## 4. xformers 打开任意终端,如:cmd、powershell或git bash ```cmd pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121 ``` ## 5. deepspeed 打开Git Bash ```bash git clone https://github.com/microsoft/DeepSpeed.git cd deepspeed rm -rf build # 显卡架构的数字根据自己情况修改 TORCH_CUDA_ARCH_LIST=8.9 DS_BUILD_OPS=0 DS_BUILD_AIO=0 DS_BUILD_CCL_COMM=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_CPU_ADAGRAD=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_INFERENCE_CORE_OPS=0 DS_BUILD_CUTLASS_OPS=0 python setup.py build_ext -j8 bdist_wheel 2>&1 | tee build.log # 完成后会在dist目录生成whl文件 pip install dist/deepspeed-0.12.4+a3926bbb-cp311-cp311-win_amd64.whl ``` 在以下commit-id: `a3926bbbf6d0025b5c6076a280e6b91ebd08aada`安装成功 ***PS***:由于deepspeed需要调用NCCL backend进行分布式训练,但NCCL不支持Windows,所以目前虽然可以安装deepspeed,但无法进行训练 ================================================ FILE: examples/data_prepare.sh ================================================ #!/bin/bash MODEL="pangu-2.6B" ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp/" DATR_DIR=$ROOT/Data/chatgpt/raw #MAIN=$ROOT/Code/chatgpt/src/data_prepare.py MAIN=$ROOT/Code/RLHF/src/data_prepare.py MODEL_PATH=$ROOT/Data/models/$MODEL #MODEL_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL OUTPUT_DIR=$ROOT/Data/chatgpt/processed #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR python $MAIN \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --model_name_or_path $MODEL_PATH ================================================ FILE: examples/eval_pretrain.sh ================================================ #!/bin/bash # C-Eval and MMLU benchamarks TASK="ceval" MODEL="llama-7B" ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" DATR_DIR=$ROOT/Data/chatgpt/raw/$TASK #MAIN=$ROOT/Code/chatgpt/src/eval_pretrain.py 
MAIN=$ROOT/Code/RLHF/src/eval_pretrain.py MODEL_PATH=$ROOT/Data/models/$MODEL #MODEL_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL OUTPUT_DIR=$ROOT/Data/chatgpt/output/pretrain/$MODEL EVAL_FILENAME="val" TRAIN_FILENAME="dev" CHECKPOINT=$ROOT/Data/chatgpt/output/pretrain/$MODEL SHOTS=5 MAX_LENGTH=1280 cd $ROOT/Code/RLHF || exit # cd $ROOT/Code/chatgpt || exit mkdir -p $OUTPUT_DIR #CUDA_VISIBLE_DEVICES=1 deepspeed --master_port 5008 $MAIN \ #deepspeed --num_gpus 1 $MAIN \ python $MAIN \ --device_map "auto" \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --model_name_or_path $MODEL_PATH \ --task $TASK \ --train_filename $TRAIN_FILENAME \ --eval_filename $EVAL_FILENAME \ --checkpoint $CHECKPOINT \ --max_length $MAX_LENGTH \ --max_few_shot $SHOTS \ --max_length_generation 1 \ > out/eval_pretrain_${TASK}_${MODEL}_${EVAL_FILENAME}_${SHOTS}-shots_${MAX_LENGTH}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ## Traditional NLP benchmark Evaluations #for TASK in "cluewsc2020" "afqmc" "csl" "iflytek" "ocnli" "cmnli" "tnews" "c3" "cmrc2018" "chid" #do # for MODEL in "pangu-350M" "pangu-2.6B" "glm-350M-chinese" "glm-10B-chinese" "pangu-13B" # do # ROOT="/mnt/sfevol775196/sunzeye273" # #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" # #ROOT="/mnt/pa002-28359-vol543625-private" # #ROOT="/root/autodl-tmp" # DATR_DIR=$ROOT/Data/chatgpt/raw/$TASK ## MAIN=$ROOT/Code/chatgpt/src/eval_pretrain.py # MAIN=$ROOT/Code/RLHF/src/eval_pretrain.py # MODEL_PATH=$ROOT/Data/models/$MODEL # #MODEL_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL # OUTPUT_DIR=$ROOT/Data/chatgpt/output/pretrain/$MODEL # EVAL_FILENAME="dev.json" # TRAIN_FILENAME="train.json" # case $MODEL in # "pangu-2.6B") # BATCH_SIZE=8 # ;; # *) # BATCH_SIZE=32 # ;; # esac # # cd $ROOT/Code/RLHF || exit ## cd $ROOT/Code/chatgpt || exit # mkdir -p $OUTPUT_DIR # # #CUDA_VISIBLE_DEVICES=1 deepspeed --master_port 5008 $MAIN \ # #deepspeed --num_gpus 1 $MAIN \ # python $MAIN \ # --device_map auto \ # 
--data_dir $DATR_DIR \ # --output_dir $OUTPUT_DIR \ # --model_name_or_path $MODEL_PATH \ # --task $TASK \ # --max_length 512 \ # --train_filename $TRAIN_FILENAME \ # --eval_filename $EVAL_FILENAME \ # --eval_batch_size $BATCH_SIZE \ # --top_p 0.8 \ # --temperature 0.8 \ # --num_return_sequences 1 \ # --max_length_generation 100 \ # > out/eval_pretrain_${MODEL}_${TASK}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 # done #done ================================================ FILE: examples/eval_reward.sh ================================================ #!/bin/bash MODEL="chatglm-6B" ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/train_reward.py MAIN=$ROOT/Code/RLHF/src/train_reward.py TOKENIZER_PATH=$ROOT/Data/models/$MODEL #TOKENIZER_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL MODEL_PATH=$ROOT/Data/chatgpt/output/reward/${MODEL} OUTPUT_DIR=$ROOT/Data/chatgpt/output/reward/$MODEL EVAL_FILENAME="dev_data_external_v1.jsonl" #CHECKPOINT="${ROOT}/Data/chatgpt/output/reward/${MODEL}/pytorch_modelstar.bin" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR #python $MAIN \ CUDA_LAUNCH_BLOCKING=1 deepspeed $MAIN \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --model_name_or_path $MODEL_PATH \ --tokenizer_path $TOKENIZER_PATH \ --max_length 512 \ --logging_steps 10 \ --deepspeed_config "stage-3.json" \ --do_eval \ --eval_filename $EVAL_FILENAME \ --eval_batch_size 20 \ > out/eval_reward_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: examples/pred_pretrain.sh ================================================ #!/bin/bash MODEL="llama-7B" ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" 
DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/pretrain.py MAIN=$ROOT/Code/RLHF/src/pretrain.py MODEL_PATH=$ROOT/Data/models/$MODEL #MODEL_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL OUTPUT_DIR=$ROOT/Data/chatgpt/output/pretrain/$MODEL/checkpoint-2000 CHECKPOINT=$OUTPUT_DIR/pytorch_model.bin TEST_FILENAME="test_prompts.jsonl" OUTPUT_FILENAME="output_${MODEL}.jsonl" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR #CUDA_VISIBLE_DEVICES=1 deepspeed --master_port 5008 $MAIN \ python $MAIN \ --device_map "auto" \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --model_name_or_path $MODEL_PATH \ --checkpoint $CHECKPOINT \ --max_length 1024 \ --max_length_generation 512 \ --bits 16 \ --do_pred \ --test_filename $TEST_FILENAME \ --output_filename $OUTPUT_FILENAME \ > out/pred_pretrain_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: examples/pred_reward.sh ================================================ #!/bin/bash MODEL="chatglm2-6B" ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/train_reward.py MAIN=$ROOT/Code/RLHF/src/train_reward.py TOKENIZER_PATH=$ROOT/Data/models/$MODEL #TOKENIZER_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL MODEL_PATH=$ROOT/Data/chatgpt/output/sft/${MODEL} OUTPUT_DIR=$ROOT/Data/chatgpt/output/reward/$MODEL TEST_FILENAME="test_data_external_v1.jsonl" OUTPUT_FILENAME="result_external_v1.tsv" CHECKPOINT="${ROOT}/Data/chatgpt/output/reward/${MODEL}/pytorch_modelstar.bin" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR python $MAIN \ --local_rank 0 \ --device_map "auto" \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --model_name_or_path $MODEL_PATH \ --tokenizer_path $TOKENIZER_PATH \ --checkpoint 
$CHECKPOINT \ --max_length 512 \ --logging_steps 50 \ --do_pred \ --test_filename $TEST_FILENAME \ --eval_batch_size 256 \ --output_filename $OUTPUT_FILENAME \ > out/pred_reward_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: examples/pretrain.sh ================================================ #!/bin/bash MODEL="llama-7B" ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/pretrain.py MAIN=$ROOT/Code/RLHF/src/pretrain_wo_trainer.py MODEL_PATH=$ROOT/Data/models/$MODEL #MODEL_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL OUTPUT_DIR=$ROOT/Data/chatgpt/output/pretrain/$MODEL TRAIN_FILENAME="pretrain_data_v1.jsonl" EVAL_FILENAME="pretrain_eval_data_v1.jsonl" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR #CUDA_VISIBLE_DEVICES=1 deepspeed --master_port 5008 $MAIN \ #python $MAIN \ CUDA_LAUNCH_BLOCKING=1 deepspeed $MAIN \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --model_name_or_path $MODEL_PATH \ --max_length 2048 \ --logging_steps 50 \ --save_steps 1000 \ --metric_for_best_model "eval_rougeLsum" \ --learning_rate 3e-4 \ --do_train \ --train_filename $TRAIN_FILENAME \ --concat_samples \ --num_epochs 2 \ --train_batch_size 8 \ --gradient_accumulation_steps 8 \ --warmup_steps 100 \ --gradient_checkpointing \ --deepspeed_config "stage-3-no_trainer.json" \ --do_eval \ --eval_filename $EVAL_FILENAME \ --eval_batch_size 8 \ --eval_steps 500 \ > out/pretrain_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: examples/train_dpo.sh ================================================ #!/bin/bash MODEL="chatglm2-6B" #ROOT="/mnt/sfevol775196/sunzeye273" ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" 
DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/train_dpo.py MAIN=$ROOT/Code/RLHF/src/train_dpo.py #TOKENIZER_PATH=$ROOT/Data/models/$MODEL TOKENIZER_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL MODEL_PATH=$ROOT/Data/chatgpt/output/sft/$MODEL REFERENCE_MODEL_PATH=$ROOT/Data/chatgpt/output/sft/$MODEL OUTPUT_DIR=$ROOT/Data/chatgpt/output/dpo/$MODEL TRAIN_FILENAME="sft_train_v2.1.jsonl" EVAL_FILENAME="sft_eval_v1.1.jsonl" TEST_FILENAME="${TRAIN_FILENAME},${EVAL_FILENAME}" OUTPUT_FILENAME="dpo_logps_v2.1.bin" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR if [ -f $OUTPUT_DIR/$OUTPUT_FILENAME ] then echo "${OUTPUT_DIR}/${OUTPUT_FILENAME} already exists, skipping prediction stage" else python $MAIN \ --local_rank 0 \ --device_map "cuda:0" \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --tokenizer_path $TOKENIZER_PATH \ --model_name_or_path $MODEL_PATH \ --max_length 512 \ --logging_steps 10 \ --eval_batch_size 32 \ --do_pred \ --test_filename $TEST_FILENAME \ --output_filename $OUTPUT_FILENAME \ > out/pred_dpo_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 fi #CUDA_VISIBLE_DEVICES=1 deepspeed --master_port 5008 $MAIN \ #python $MAIN \ CUDA_LAUNCH_BLOCKING=1 deepspeed $MAIN \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --tokenizer_path $TOKENIZER_PATH \ --model_name_or_path $MODEL_PATH \ --max_length 512 \ --logging_steps 10 \ --save_steps 100 \ --learning_rate 1e-5 \ --do_train \ --train_filename $TRAIN_FILENAME \ --train_batch_size 8 \ --gradient_accumulation_steps 8 \ --num_epochs 5 \ --gradient_checkpointing \ --deepspeed_config "stage-3.json" \ --do_eval \ --eval_filename $EVAL_FILENAME \ --eval_batch_size 32 \ --output_filename $OUTPUT_FILENAME \ > out/train_dpo_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: examples/train_reward.sh ================================================ #!/bin/bash MODEL="chatglm-6B" 
ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/train_reward.py MAIN=$ROOT/Code/RLHF/src/train_reward.py TOKENIZER_PATH=$ROOT/Data/models/$MODEL #TOKENIZER_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL MODEL_PATH=$ROOT/Data/chatgpt/output/sft/${MODEL} OUTPUT_DIR=$ROOT/Data/chatgpt/output/reward/$MODEL TRAIN_FILENAME="train_data_external_v1.jsonl" EVAL_FILENAME="dev_data_external_v1.jsonl" #CHECKPOINT="${ROOT}/Data/chatgpt/output/sft/${MODEL}/pytorch_modelstar.bin" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR #python $MAIN \ CUDA_LAUNCH_BLOCKING=1 deepspeed $MAIN \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --model_name_or_path $MODEL_PATH \ --tokenizer_path $TOKENIZER_PATH \ --max_length 512 \ --logging_steps 10 \ --save_steps 100 \ --metric_for_best_model "eval_accuracy" \ --learning_rate 1e-5 \ --do_train \ --train_filename $TRAIN_FILENAME \ --train_batch_size 24 \ --gradient_accumulation_steps 8 \ --num_epochs 5 \ --gradient_checkpointing \ --deepspeed_config "stage-3.json" \ --do_eval \ --eval_filename $EVAL_FILENAME \ --eval_batch_size 24 \ > out/train_reward_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: examples/train_rlhf.sh ================================================ #!/bin/bash REWARD_MODEL="pangu-350M" SFT_MODEL="pangu-2.6B" ROOT="/mnt/sfevol775196/sunzeye273" #ROOT="/mnt/share-pa002-vol682688-prd/sunzeye273" #ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp/" DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/train_rlhf.py MAIN=$ROOT/Code/RLHF/src/train_rlhf.py #ACCELERATE_CONFIG=$ROOT/Code/RLHF/src/resources/ppo_model/default_accelerate_config.yaml TOKENIZER_PATH=$ROOT/Data/models/$REWARD_MODEL 
ACTOR_MODEL_PATH=$ROOT/Data/chatgpt/output/sft/$SFT_MODEL CRITIC_MODEL_PATH=$ROOT/Data/models/$REWARD_MODEL #CRITIC_MODEL_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$REWARD_MODEL CRITIC_CHECKPOINT=$ROOT/Data/chatgpt/output/reward/$REWARD_MODEL/pytorch_model.bin OUTPUT_DIR=$ROOT/Data/chatgpt/output/rlhf/$SFT_MODEL TRAIN_FILENAME="train_data_external_v1.jsonl" PRETRAIN_FILENAME="train_data_external_v1.jsonl" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR #python $MAIN \ #accelerate launch --main_process_port 5007 --config_file $ACCELERATE_CONFIG $MAIN \ CUDA_LAUNCH_BLOCKING=1 deepspeed $MAIN \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --tokenizer_path $TOKENIZER_PATH \ --actor_model_path $ACTOR_MODEL_PATH \ --critic_model_path $CRITIC_MODEL_PATH \ --critic_checkpoint $CRITIC_CHECKPOINT \ --max_length 512 \ --max_gen_length 256 \ --logging_steps 10 \ --do_train \ --train_filename $TRAIN_FILENAME \ --pretrain_filename $PRETRAIN_FILENAME \ --actor_learning_rate 1e-5 \ --critic_learning_rate 1e-5 \ --lr_scheduler_type cosine \ --train_batch_size 4 \ --ppo_train_batch_size 4 \ --gradient_accumulation_steps 16 \ --num_epochs 1 \ --ppo_epochs 1 \ --enable_hybrid_engine \ --actor_zero_stage 3 \ --critic_zero_stage 3 \ --offload_reference_model \ --actor_gradient_checkpointing \ --critic_gradient_checkpointing \ --release_inference_cache \ > out/train_rlhf_${SFT_MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: examples/train_sft.sh ================================================ #!/bin/bash MODEL="chatglm2-6B" #ROOT="/mnt/sfevol775196/sunzeye273" ROOT="/mnt/pa002-28359-vol543625-private" #ROOT="/root/autodl-tmp" DATR_DIR=$ROOT/Data/chatgpt/processed #MAIN=$ROOT/Code/chatgpt/src/train_sft.py MAIN=$ROOT/Code/RLHF/src/train_sft.py #TOKENIZER_PATH=$ROOT/Data/models/$MODEL TOKENIZER_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL 
#MODEL_PATH=$ROOT/Data/models/$MODEL MODEL_PATH=/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint/$MODEL OUTPUT_DIR=$ROOT/Data/chatgpt/output/sft/$MODEL TRAIN_FILENAME="train_data_external_v1.jsonl" EVAL_FILENAME="dev_data_external_v1.jsonl" #cd $ROOT/Code/chatgpt || exit cd $ROOT/Code/RLHF || exit mkdir -p $OUTPUT_DIR #CUDA_VISIBLE_DEVICES=1 deepspeed --master_port 5008 $MAIN \ #python $MAIN \ CUDA_LAUNCH_BLOCKING=1 deepspeed $MAIN \ --data_dir $DATR_DIR \ --output_dir $OUTPUT_DIR \ --tokenizer_path $TOKENIZER_PATH \ --model_name_or_path $MODEL_PATH \ --max_length 512 \ --logging_steps 10 \ --save_steps 100 \ --metric_for_best_model "eval_rougeLsum" \ --learning_rate 1e-5 \ --do_train \ --train_filename $TRAIN_FILENAME \ --concat_samples \ --train_batch_size 8 \ --gradient_accumulation_steps 8 \ --num_epochs 5 \ --gradient_checkpointing \ --deepspeed_config "stage-3.json" \ --do_eval \ --eval_filename $EVAL_FILENAME \ --eval_batch_size 32 \ > out/train_sft_${MODEL}_"`date "+%Y-%m-%d-%H:%M:%S"`".log 2>&1 & ================================================ FILE: notebook/data_prepare.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os, time, re, random, glob, json, jieba, copy\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import torch\n", "from transformers import (\n", " AutoModelForCausalLM,\n", " AutoModelForSeq2SeqLM,\n", " AutoModelForMultipleChoice,\n", " AutoTokenizer,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " TextGenerationPipeline\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "from sys import platform\n", "if platform == \"linux\" or platform == \"linux2\":\n", " # linux\n", " root = \"/mnt/sfevol775196/sunzeye273/Data\"\n", "# root = 
\"/mnt/share-pa002-vol682688-prd/sunzeye273/Data\"\n", "# root = \"/mnt/pa002-28359-vol543625-private/Data\"\n", "# root = \"/root/autodl-tmp/Data\"\n", "elif platform == \"darwin\":\n", " # OS X\n", " root = \"/Users/zeyesun/Documents/Data\"\n", "elif platform == \"win32\":\n", " # Windows...\n", " root = \"D:\\\\Data\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "CLEAN_TEXT_PATTERN = re.compile(r\"[\\r\\n]\")\n", "\n", "def clean_text(text):\n", " return CLEAN_TEXT_PATTERN.sub(\"\", text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model_name = \"pangu-350M\"\n", "# model_name = \"chatglm-6B\"\n", "model_name_or_path = os.path.join(root, \"models\", model_name)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", "print(tokenizer.special_tokens_map)\n", "print(tokenizer.all_special_ids)\n", "print(\n", " f\"unk: {tokenizer.unk_token_id}\\n\",\n", " f\"pad: {tokenizer.pad_token_id}\\n\",\n", " f\"bos: {tokenizer.bos_token_id}\\n\",\n", " f\"eos: {tokenizer.eos_token_id}\\n\",\n", " f\"sep: {tokenizer.sep_token_id}\\n\",\n", " f\"mask: {tokenizer.mask_token_id}\\n\",\n", "# f\"eop: {tokenizer.eop_token_id}\\n\"\n", "# f\"sop: {tokenizer.sop_token_id}\\n\"\n", "# f\"cls: {tokenizer.cls_token_id}\"\n", ") " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Processing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### weibo_summary_comments_json" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "t = time.time()\n", "fi = os.path.join(root, \"raw\", \"weibo_summary_comments_json.json\")\n", "fo = os.path.join(root, \"chatgpt\", \"processed\", \"weibo_summary_comments.jsonl\")\n", "ct = 0\n", "with open(fo, \"w\", encoding=\"utf-8\") as w:\n", " with open(fi, \"r\", encoding=\"utf-8\") as r:\n", " while True:\n", " line = 
" if not line:\n", " break\n",
" answers = l['answers']\n", " answers.extend([{\"answer\": fa, \"score\": 0} for fa in random.sample(nexts_tmp, 2)])\n",
"fp = os.path.join(root, \"raw\", \"JDData\", \"*.data*\")\n",
"dfr.groupby([\"movieId\", \"rating\"]).count()['comment']"
" answers = l['answers']\n", " answer = answers[0]['answer']\n",
[ "### Chinese Poetry" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import opencc\n", "converter = opencc.OpenCC('t2s.json')\n", "t1 = time.time()\n", "fp = [\n", " # 四书五经\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"lunyu\", \"lunyu.json\"),\n", "# os.path.join(root, \"raw\", \"chinese-poetry\", \"mengxue\", \"*.json\"),\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"sishuwujing\", \"*.json\"),\n", " # 古体诗\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"caocaoshiji\", \"caocao.json\"),\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"shijing\", \"shijing.json\"),\n", " # 楚辞\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"chuci\", \"chuci.json\"),\n", " # 诗\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"shi\", \"poet*.json\"),\n", " # 词\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"ci\", \"ci*.json\"),\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"nalanxingde\", \"*.json\"),\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"wudai\", \"huajianji\", \"*juan.json\"),\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"wudai\", \"nantang\", \"poetrys.json\"),\n", " # 曲\n", " os.path.join(root, \"raw\", \"chinese-poetry\", \"yuanqu\", \"yuanqu.json\"),\n", "]\n", "fs = [each for f in fp for each in glob.glob(f)]\n", "\n", "l5 = []\n", "dicts = dict()\n", "for fi in fs:\n", " lines = json.load(open(fi, \"r\", encoding=\"utf-8\"))\n", " if isinstance(lines, dict):\n", " lines = [lines]\n", " for i, line in enumerate(lines):\n", " if \"lunyu\" in fi:\n", " author = \"孔子\"\n", " genre = \"经书\"\n", " title = line['chapter']\n", " contents = \"\".join(line['paragraphs'])\n", " elif \"daxue\" in fi:\n", " author = \"曾子\"\n", " genre = \"经书\"\n", " title = \"大学\"\n", " contents = converter.convert(\"\".join(line['paragraphs'])).replace(\"「\", \"“\").replace(\"」\", \"”\")\n", " elif \"mengzi\" in fi:\n", " author = \"孟子\"\n", " genre = \"经书\"\n", 
" title = converter.convert(line['chapter'])\n", " contents = converter.convert(\"\".join(line['paragraphs'])).replace(\"「\", \"“\").replace(\"」\", \"”\")\n", " elif \"zhongyong\" in fi:\n", " author = \"孔伋\"\n", " genre = \"经书\"\n", " title = \"中庸\"\n", " contents = converter.convert(\"\".join(line['paragraphs'])).replace(\"「\", \"“\").replace(\"」\", \"”\")\n", " elif \"caocao\" in fi:\n", " author = \"曹操\"\n", " genre = \"古体诗\"\n", " title = line['title']\n", " contents = \"\".join(line['paragraphs'])\n", " elif \"shijing\" in fi:\n", " author = \"诗经\"\n", " genre = \"古体诗\"\n", " title = line['chapter'] + \"-\" + line['section'] + \"-\" + line['title']\n", " contents = \"\".join(line['content'])\n", " elif \"chuci\" in fi:\n", " author = line['author']\n", " genre = \"楚辞\"\n", " title = line['section'] + \"-\" + line['title']\n", " contents = \"\".join(line['content'])\n", " elif \"nalanxingde\" in fi:\n", " author = line['author']\n", " genre = \"词\"\n", " title = line['title']\n", " contents = \"\".join(line['para'])\n", " elif \"huajianci\" in fi:\n", " author = line['author']\n", " genre = \"词\"\n", " title = line['title']\n", " contents = \"\".join(line['paragraphs'])\n", " elif \"nantang\" in fi:\n", " author = line['author']\n", " genre = \"词\"\n", " title = line['title']\n", " contents = \"\".join(line['paragraphs'])\n", " elif \"yuanqu\" in fi:\n", " author = line['author']\n", " genre = \"曲\"\n", " title = line['title']\n", " contents = \"\".join(line['paragraphs'])\n", " elif \"shi\" in fi:\n", " if len(line['paragraphs']) <= 0:\n", " continue\n", " author = converter.convert(line['author'])\n", " genre = \"五言诗\" if len(line['paragraphs'][0]) == 12 else \"七言诗\"\n", " title = converter.convert(line['title'])\n", " contents = converter.convert(\"\".join(line['paragraphs']))\n", " elif \"ci\" in fi:\n", " author = line['author']\n", " genre = \"词\"\n", " title = line['rhythmic']\n", " contents = \"\".join(line['paragraphs'])\n", " if genre not in 
dicts:\n", " dicts[genre] = dict()\n", " if author not in dicts[genre]:\n", " dicts[genre][author] = dict()\n", " quantifier = \"篇\" if genre in [\"经书\", \"楚辞\"] else \"首\"\n", " prompt = f\"以{author}的风格,写一{quantifier}{genre},题为{title}{tokenizer.sep_token}\"\n", " answers = [{\"answer\": contents, \"score\": 1}]\n", " l5.append({\"prompt\": prompt, \"answers\": answers, \"genre\": genre, \"title\": title, \"author\": author})\n", " dicts[genre][author][title] = contents\n", " \n", "t2 = time.time()\n", "print(f\"length: {len(l5)}, # different lengths: {len(dicts)}, time taken: {t2-t1} s\")\n", "fo = os.path.join(root, \"chatgpt\", \"processed\", \"chinese_poetry.jsonl\")\n", "with open(fo, \"w\", encoding=\"utf-8\") as w:\n", " for i, l in tqdm(enumerate(l5), desc=\"Processing Chinese Poetry\"):\n", " genre = l['genre']\n", " author = l['author']\n", " title = l['title']\n", " prompt = l['prompt']\n", " answers = l['answers']\n", " # 同作者其他作品-2\n", " titles_tmp = set(dicts[genre][author].keys())\n", " titles_tmp.remove(title)\n", " if len(titles_tmp) > 0:\n", " t = random.choice(list(titles_tmp))\n", " answers.append({\"answer\": dicts[genre][author][t], \"score\": 0})\n", " # 同体裁其他作者其他作品-1\n", " authors_tmp = set(dicts[genre].keys())\n", " authors_tmp.remove(author)\n", " a = random.choice(list(authors_tmp))\n", " t = random.choice(list(dicts[genre][a].keys()))\n", " answers.append({\"answer\": dicts[genre][a][t], \"score\": -1})\n", " # 不同体裁作品-0\n", " genres_tmp = set(dicts.keys())\n", " genres_tmp.remove(genre)\n", " g = random.choice(list(genres_tmp))\n", " a = random.choice(list(dicts[g].keys()))\n", " t = random.choice(list(dicts[g][a].keys()))\n", " answers.append({\"answer\": dicts[g][a][t], \"score\": -2})\n", " w.write(json.dumps({\"prompt\": prompt, \"answers\": answers}, ensure_ascii=False)+'\\n')\n", "print(f\"length: {i}, time taken: {time.time()-t2} s\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### baike_qa_2019" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fs = glob.glob(os.path.join(root, \"raw\", \"baike_qa2019\", \"baike_qa_*.json\"))\n", "fo = os.path.join(root, \"chatgpt\", \"processed\", \"baike_qa.jsonl\")\n", "ct = 0\n", "# items = []\n", "# lens_prompt = []\n", "# lens_label = []\n", "with open(fo, \"w\", encoding=\"utf-8\") as w:\n", " for f in fs:\n", " with open(f, \"r\", encoding=\"utf-8\") as r:\n", " while True:\n", " line = r.readline()\n", " if not line:\n", " break\n", " item = json.loads(line.strip(\"\\n\"))\n", " question = clean_text(item['title'] if len(item['title']) > len(item['desc']) else item['desc'])\n", " prompt = f\"{question}{tokenizer.sep_token}回答:\"\n", " answer = clean_text(item['answer'])\n", " answers = [{\"answer\": answer, \"score\": 1}]\n", "# items.append(item)\n", "# lens_prompt.append(len(prompt))\n", "# lens_label.append(len(label))\n", " w.write(json.dumps({\"prompt\": prompt, \"answers\": answers}, ensure_ascii=False)+'\\n')\n", " ct += 1\n", "print(ct)\n", "# print(len(items))\n", "# print(np.percentile(lens_prompt, np.arange(90, 101)))\n", "# print(np.percentile(lens_label, np.arange(90, 101)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### rm-static" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fi = os.path.join(root, \"raw\", \"rm-static\", \"data\", \"test-00000-of-00001-bf4c733542e35fcb.parquet\")\n", "df = pd.read_parquet(fi)\n", "print(df.shape)\n", "df.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ChatGLM" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModel\n", "model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True).half().cuda()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "texts = [\n", " \"你好\",\n", " \"晚上睡不着应该怎么办\"\n", 
"]\n", "history = []\n", "for text in texts:\n", " response, history = model.chat(tokenizer, text, history=history)\n", " print(f\"问: {text}\\n答:{response}\\n\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: notebook/reward.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "a7d1682d", "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.insert(0, \"/root/autodl-tmp/Code/RLHF\")\n", "sys.path.insert(0, \"/Users/zeyesun/Documents/Code/RLHF\")\n", "sys.path.insert(0, \"D:\\\\Code\\\\RLHF\")\n", "sys.path.insert(0, \"/mnt/sfevol775196/sunzeye273/Code/chatgpt\")\n", "sys.path.insert(0, \"/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt\")\n", "sys.path.insert(0, \"/mnt/pa002-28359-vol543625-private/Code/chatgpt\")\n", "\n", "import os, time, re, random, glob, json, jieba, copy\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import torch\n", "from transformers import (\n", " AutoModelForCausalLM,\n", " AutoModelForSeq2SeqLM,\n", " AutoModelForMultipleChoice,\n", " AutoTokenizer,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " TextGenerationPipeline\n", ")\n", "\n", "from src.models.reward import RewardModel\n", "\n", "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "from sys import platform\n", "if platform == \"linux\" or platform == \"linux2\":\n", " # linux\n", " root = \"/mnt/sfevol775196/sunzeye273/Data\"\n", "# root = \"/mnt/share-pa002-vol682688-prd/sunzeye273/Data\"\n", "# root = 
\"/mnt/pa002-28359-vol543625-private/Data\"\n", "# root = \"/root/autodl-tmp/Data\"\n", "elif platform == \"darwin\":\n", " # OS X\n", " root = \"/Users/zeyesun/Documents/Data\"\n", "elif platform == \"win32\":\n", " # Windows...\n", " root = \"D:\\\\Data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "a67e7e4e", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoConfig\n", "model_name = \"pangu-small\"\n", "model_name_or_path = os.path.join(root, \"models\", model_name)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", "config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)\n", "model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)\n", "f = os.path.join(model_name_or_path, \"pytorch_model.bin\")\n", "torch.save(model.state_dict(), f)\n", "\n", "model.config.lora_rank = 0\n", "reward_model = RewardModel(model.config, model.transformer, tokenizer)\n", "reward_checkpoint = os.path.join(root, \"chatgpt\", \"output\", \"reward\", model_name, \"pytorch_model.bin\")\n", "torch.save(reward_model.state_dict(), reward_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, "id": "078a3631", "metadata": {}, "outputs": [], "source": [ "# model_name = \"pangu-350M\"\n", "model_name = \"pangu-small\"\n", "# model_name = \"chatglm-6B\"\n", "model_name_or_path = os.path.join(root, \"models\", model_name)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", "print(tokenizer.special_tokens_map)\n", "print(tokenizer.all_special_ids)\n", "print(\n", " f\"unk: {tokenizer.unk_token_id}\\n\",\n", " f\"pad: {tokenizer.pad_token_id}\\n\",\n", " f\"bos: {tokenizer.bos_token_id}\\n\",\n", " f\"eos: {tokenizer.eos_token_id}\\n\",\n", " f\"sep: {tokenizer.sep_token_id}\\n\",\n", " f\"mask: {tokenizer.mask_token_id}\\n\",\n", "# f\"eop: {tokenizer.eop_token_id}\\n\"\n", "# 
f\"sop: {tokenizer.sop_token_id}\\n\"\n", "# f\"cls: {tokenizer.cls_token_id}\"\n", ") " ] }, { "cell_type": "code", "execution_count": null, "id": "417b6ada", "metadata": {}, "outputs": [], "source": [ "lora_rank = 0\n", "lora_alpha = 1\n", "lora_train_bias = \"none\"" ] }, { "cell_type": "code", "execution_count": null, "id": "753d0550", "metadata": {}, "outputs": [], "source": [ "if \"pangu\" in model_name_or_path:\n", " model = AutoModelForCausalLM.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", " model.resize_token_embeddings(tokenizer.vocab_size)\n", " model.config.lora_rank = lora_rank\n", " model.config.lora_alpha = lora_alpha\n", " model.config.lora_train_bias = lora_train_bias\n", " # Initialize the reward model from the (supervised) fine-tuned SFT model\n", " reward_model = RewardModel(model.config, model.transformer, tokenizer)\n", "elif \"chatglm\" in model_name_or_path:\n", " model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True).half()\n", " model.config.lora_rank = lora_rank\n", " model.config.lora_alpha = lora_alpha\n", " model.config.lora_train_bias = lora_train_bias\n", " # Initialize the reward model from the (supervised) fine-tuned SFT model\n", " reward_model = RewardModel(model.config, model.transformer, tokenizer).half()\n", " # reward_model = RewardModelWithLoRA(model.config, model.glm, tokenizer)\n", "# layers = reward_model.transformer.layers\n", "elif \"glm\" in model_name_or_path:\n", " model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)\n", " model.config.lora_rank = lora_rank\n", " model.config.lora_alpha = lora_alpha\n", " model.config.lora_train_bias = lora_train_bias\n", " # Initialize the reward model from the (supervised) fine-tuned SFT model\n", " reward_model = RewardModel(model.config, model.glm, tokenizer)\n", " # reward_model = RewardModelWithLoRA(model.config, model.glm, tokenizer)\n", "# layers = 
reward_model.transformer.transformer.layers\n", "else:\n", " raise ValueError(f\"Unsupported model name: {model_name_or_path}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3b18dc5e", "metadata": {}, "outputs": [], "source": [ "# reward_checkpoint = os.path.join(root, \"chatgpt\", \"output\", \"reward\", model_name, \"checkpoint-200549\", \"pytorch_model*.bin\")\n", "reward_checkpoint = os.path.join(root, \"chatgpt\", \"output\", \"reward\", model_name, \"pytorch_model*.bin\")\n", "checkpoints = glob.glob(reward_checkpoint)\n", "st = dict()\n", "for checkpoint in checkpoints:\n", " st.update(torch.load(checkpoint, map_location=\"cpu\"))\n", "reward_model.load_state_dict(st)" ] }, { "cell_type": "code", "execution_count": null, "id": "800f59c3", "metadata": {}, "outputs": [], "source": [ "# reward_model = reward_model.half().to(device)\n", "reward_model = reward_model.to(device)" ] }, { "cell_type": "markdown", "id": "a68f86ad", "metadata": {}, "source": [ "# Reward Model Train" ] }, { "cell_type": "code", "execution_count": null, "id": "58669674", "metadata": {}, "outputs": [], "source": [ "from src.data.data import SFTDataset, PairwiseDataset\n", "from torch.utils.data import DataLoader, SequentialSampler\n", "class dotdict(dict):\n", " __getattr__ = dict.get\n", " __setattr__ = dict.__setitem__\n", " __delattr__ = dict.__delitem__\n", "\n", "max_length = 128\n", "batch_size = 2\n", "args = dotdict({\"model_name_or_path\": model_name_or_path, \"max_length\": max_length})\n", "train_filename = os.path.join(root, \"chatgpt\", \"processed\", \"dev_data_external_v1.jsonl\")\n", "train_dataset = PairwiseDataset(args, train_filename, tokenizer)\n", "sampler = SequentialSampler(train_dataset)\n", "train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)" ] }, { "cell_type": "code", "execution_count": null, "id": "bdab3a67", "metadata": { "scrolled": false }, "outputs": [], "source": [ "reward_model.train()\n", "for batch in 
train_loader:\n", " print(batch.keys())\n", " chosen_input_ids = batch['chosen_input_ids'].to(device)\n", " chosen_attention_mask = batch['chosen_attention_mask'].to(device) if 'chosen_attention_mask' in batch else None\n", " chosen_position_ids = batch['chosen_position_ids'].to(device) if 'chosen_position_ids' in batch else None\n", " rejected_input_ids = batch['rejected_input_ids'].to(device)\n", " rejected_attention_mask = batch['rejected_attention_mask'].to(device) if 'rejected_attention_mask' in batch else None\n", " rejected_position_ids = batch['rejected_position_ids'].to(device) if 'rejected_position_ids' in batch else None\n", " \n", " chosen_values, chosen_reward = reward_model.reward(chosen_input_ids, chosen_attention_mask, chosen_position_ids)\n", " reject_values, reject_reward = reward_model.reward(rejected_input_ids, rejected_attention_mask, rejected_position_ids)\n", " break" ] }, { "cell_type": "code", "execution_count": null, "id": "44da49d2", "metadata": {}, "outputs": [], "source": [ "chosen_end_scores = []\n", "rejected_end_scores = []\n", "bs = chosen_input_ids.shape[0]\n", "if len(chosen_input_ids.shape) == 3:\n", " chosen_input_ids = chosen_input_ids.squeeze(1)\n", "if len(rejected_input_ids.shape) == 3:\n", " rejected_input_ids = rejected_input_ids.squeeze(1)\n", "\n", "loss = 0\n", "inference = False\n", "for i in range(bs):\n", " if torch.all(torch.eq(chosen_input_ids[i], rejected_input_ids[i])).item():\n", " c_inds = (chosen_input_ids[i] == tokenizer.pad_token_id).nonzero()\n", " c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen_input_ids.shape[1]\n", " chosen_end_scores.append(chosen_values[i, c_ind - 1])\n", " inference = True\n", " continue\n", " print(f\"inference: {inference}\")\n", "\n", " # Check if there is any padding otherwise take length of sequence\n", " c_inds = (chosen_input_ids[i] == tokenizer.pad_token_id).nonzero()\n", " c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen_input_ids.shape[1]\n", " r_inds = 
(rejected_input_ids[i] == tokenizer.pad_token_id).nonzero()\n", " r_ind = r_inds[0].item() if len(r_inds) > 0 else rejected_input_ids.shape[1]\n", " end_ind = max(c_ind, r_ind)\n", "\n", " # Retrieve first index where trajectories diverge\n", " divergence_ind = (chosen_input_ids[i] != rejected_input_ids[i]).nonzero()[0]\n", " assert divergence_ind > 0\n", "\n", " # Index into the correct rewards\n", " c_truncated_reward = chosen_values[i][divergence_ind:end_ind]\n", " r_truncated_reward = reject_values[i][divergence_ind:end_ind]\n", "\n", " # Append the last rewards to the list of end scores\n", " chosen_end_scores.append(c_truncated_reward[-1])\n", " rejected_end_scores.append(r_truncated_reward[-1])\n", "\n", " # Compute loss\n", " loss += -torch.log(torch.sigmoid(c_truncated_reward - r_truncated_reward)).mean()\n", "\n", " loss = loss / bs\n", "chosen_end_scores = torch.stack(chosen_end_scores)" ] }, { "cell_type": "code", "execution_count": null, "id": "52095bab", "metadata": {}, "outputs": [], "source": [ "loss" ] }, { "cell_type": "markdown", "id": "6b02e60d", "metadata": {}, "source": [ "# Reward Model Eval" ] }, { "cell_type": "code", "execution_count": null, "id": "70abf467", "metadata": {}, "outputs": [], "source": [ "reward_model.eval()" ] }, { "cell_type": "code", "execution_count": null, "id": "9f8b4d40", "metadata": {}, "outputs": [], "source": [ "max_length = 512\n", "tokenizer.padding_side = \"right\"\n", "# tokenizer.padding_side = \"left\"\n", "\n", "prompt = \"现代文:行三十五里,进入登封县境的耿店。\"\n", "prefix = \"古文:\"\n", "pred = \"\"\n", "\n", "encodings_dict = tokenizer(prompt, prefix+pred, max_length=max_length,\n", " truncation=\"longest_first\", padding=\"max_length\", return_tensors=\"pt\",\n", " return_token_type_ids=False, padding_side=\"left\")\n", "print(encodings_dict.keys())\n", "print(encodings_dict['input_ids'].shape)\n", "print(encodings_dict['attention_mask'].shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "7d88344b", 
"metadata": {}, "outputs": [], "source": [ "input_ids = encodings_dict['input_ids'].to(device)\n", "attention_mask = encodings_dict['attention_mask'].to(device)\n", "res = reward_model(input_ids, attention_mask)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: notebook/rlhf.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "e0fa8199", "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.insert(0, \"/root/autodl-tmp/Code/RLHF\")\n", "sys.path.insert(0, \"/Users/zeyesun/Documents/Code/RLHF\")\n", "sys.path.insert(0, \"D:\\\\Code\\\\RLHF\")\n", "sys.path.insert(0, \"/mnt/sfevol775196/sunzeye273/Code/chatgpt\")\n", "sys.path.insert(0, \"/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt\")\n", "sys.path.insert(0, \"/mnt/pa002-28359-vol543625-private/Code/chatgpt\")\n", "\n", "import os, time, re, random, glob, json, jieba, copy\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import torch\n", "from transformers import (\n", " AutoModelForCausalLM,\n", " AutoModelForSeq2SeqLM,\n", " AutoModelForMultipleChoice,\n", " AutoTokenizer,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " TextGenerationPipeline\n", ")\n", "\n", "from src.models.reward import RewardModel\n", "\n", "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "from sys import platform\n", "if platform == \"linux\" or platform == \"linux2\":\n", " # linux\n", " root = \"/mnt/sfevol775196/sunzeye273/Data\"\n", "# root = 
\"/mnt/share-pa002-vol682688-prd/sunzeye273/Data\"\n", "# root = \"/mnt/pa002-28359-vol543625-private/Data\"\n", "# root = \"/root/autodl-tmp/Data\"\n", "elif platform == \"darwin\":\n", " # OS X\n", " root = \"/Users/zeyesun/Documents/Data\"\n", "elif platform == \"win32\":\n", " # Windows...\n", " root = \"D:\\\\Data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "354bbf38", "metadata": {}, "outputs": [], "source": [ "# model_name = \"pangu-small\"\n", "# model_name = \"pangu-350M\"\n", "model_name = \"glm-small\"\n", "# model_name = \"chatglm-6B\"\n", "model_name_or_path = os.path.join(root, \"models\", model_name)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", "print(tokenizer.special_tokens_map)\n", "print(tokenizer.all_special_ids)\n", "print(\n", " f\"unk: {tokenizer.unk_token_id}\\n\",\n", " f\"pad: {tokenizer.pad_token_id}\\n\",\n", " f\"bos: {tokenizer.bos_token_id}\\n\",\n", " f\"eos: {tokenizer.eos_token_id}\\n\",\n", " f\"sep: {tokenizer.sep_token_id}\\n\",\n", " f\"mask: {tokenizer.mask_token_id}\\n\",\n", " f\"eop: {tokenizer.eop_token_id}\\n\"\n", "# f\"sop: {tokenizer.sop_token_id}\\n\"\n", " f\"cls: {tokenizer.cls_token_id}\"\n", ") " ] }, { "cell_type": "code", "execution_count": null, "id": "b2de5b11", "metadata": {}, "outputs": [], "source": [ "if \"glm\" in model_name_or_path:\n", " model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)\n", " if \"chatglm\" in model_name_or_path:\n", " model = model.half()\n", "else:\n", " model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, use_cache=False)\n", " model.resize_token_embeddings(tokenizer.vocab_size)\n", "if lora_rank > 0:\n", " convert_to_lora_recursively(model, lora_rank, lora_alpha)\n", " lora.mark_only_lora_as_trainable(model, lora_train_bias)\n", "model = model.to(device)\n", "model.eval()\n", "print(model.device)" ] }, { "cell_type": 
"code", "execution_count": null, "id": "904753d8", "metadata": {}, "outputs": [], "source": [ "prompt = \"你好,你是谁?\"\n", "prefix = \"答:\"\n", "label = \"我是ChatGPT\"\n", "max_length = 32\n", "max_gen_length = 16\n", "max_prompt_length = max_length - max_gen_length\n", "lora_rank = 0" ] }, { "cell_type": "markdown", "id": "a1484a82", "metadata": {}, "source": [ "## Dataset Debug" ] }, { "cell_type": "code", "execution_count": null, "id": "469acb98", "metadata": {}, "outputs": [], "source": [ "from src.data.data import SFTDataset\n", "from torch.utils.data import RandomSampler, DataLoader\n", "class dotdict(dict):\n", " \"\"\"dot.notation access to dictionary attributes\"\"\"\n", " __getattr__ = dict.get\n", " __setattr__ = dict.__setitem__\n", " __delattr__ = dict.__delitem__\n", "\n", "args = {'model_name_or_path': model_name_or_path,\n", " \"max_length\": 128}\n", "args = dotdict(args)\n", "\n", "train_dataset = SFTDataset(args, \"/Users/zeyesun/Documents/Data/chatgpt/processed/test_data_external_v1.jsonl\", \n", " tokenizer)\n", "train_sampler = RandomSampler(train_dataset)\n", "train_dataloader = DataLoader(\n", " train_dataset,\n", " sampler=train_sampler,\n", " batch_size=4)" ] }, { "cell_type": "markdown", "id": "14b621de", "metadata": {}, "source": [ "## Generation Debug" ] }, { "cell_type": "code", "execution_count": null, "id": "1a9d87b5", "metadata": {}, "outputs": [], "source": [ "tokenizer.padding_side = \"left\"\n", "inputs = tokenizer(prompt, \n", "# label, \n", "# tokenizer.sep_token + prefix,\n", " max_length=max_prompt_length,\n", " padding=\"max_length\",\n", " truncation=\"longest_first\", \n", "# add_special_tokens=False,\n", " return_tensors=\"pt\", \n", " return_token_type_ids=False)\n", "print(inputs)" ] }, { "cell_type": "code", "execution_count": null, "id": "ce652f55", "metadata": {}, "outputs": [], "source": [ "batch_size, prompt_length = inputs['input_ids'].shape\n", "\n", "with torch.no_grad():\n", "# logger.debug(f\"[_generate_sequence] 
inputs: {inputs}\")\n", " inputs = {k: v.to(device) for k, v in inputs.items()}\n", " seq = model.generate(**inputs, \n", " max_new_tokens=max_gen_length,\n", " pad_token_id=tokenizer.pad_token_id,\n", " do_sample=False,\n", " num_return_sequences=1,\n", " top_p=0.9,\n", " temperature=1.0\n", " )\n", "print(f\"seq: {seq}\")\n", "print(tokenizer.batch_decode(seq))" ] }, { "cell_type": "code", "execution_count": null, "id": "018d2c5c", "metadata": {}, "outputs": [], "source": [ "prompts = []\n", "for i in range(batch_size):\n", " prompt_ids = seq[i, :prompt_length]\n", " prompt_start_index = (prompt_ids != tokenizer.pad_token_id).nonzero()[0].item()\n", " prompt_ids = seq[i, prompt_start_index:prompt_length]\n", " answer_ids = seq[i, prompt_length:]\n", " prompt = tokenizer.decode(prompt_ids, skip_special_tokens=False)\n", " answer = tokenizer.decode(answer_ids, skip_special_tokens=False)\n", " prompts.append(prompt + answer)\n", "print(prompts)\n", "outputs = tokenizer(prompts, max_length=max_length,\n", " truncation=\"longest_first\", padding=\"max_length\",\n", " return_tensors=\"pt\", return_token_type_ids=False)\n", "print(outputs)\n", "print(tokenizer.batch_decode(outputs['input_ids']))" ] }, { "cell_type": "code", "execution_count": null, "id": "429917ee", "metadata": {}, "outputs": [], "source": [ "outputs['input_ids'].device" ] }, { "cell_type": "markdown", "id": "22ca3132", "metadata": {}, "source": [ "## ChatGLM attention mask and position ids Debug" ] }, { "cell_type": "code", "execution_count": null, "id": "d3308b94", "metadata": {}, "outputs": [], "source": [ "# chatglm build attention mask\n", "input_ids = inputs['input_ids']\n", "batch_size, seq_length = input_ids.shape\n", "context_lengths = [seq.tolist().index(tokenizer.bos_token_id) for seq in input_ids]\n", "attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)\n", "print(attention_mask.shape)\n", "attention_mask.tril_()\n", "for i, context_length in 
enumerate(context_lengths):\n", " attention_mask[i, :, :context_length] = 1\n", "print(attention_mask.shape)\n", "attention_mask.unsqueeze_(1)\n", "print(attention_mask.shape)\n", "# attention_mask = (attention_mask < 0.5).bool()" ] }, { "cell_type": "code", "execution_count": null, "id": "8c4fd4e6", "metadata": {}, "outputs": [], "source": [ "# chatglm build position ids\n", "batch_size, seq_length = input_ids.shape\n", "context_lengths = [seq.tolist().index(tokenizer.bos_token_id) for seq in input_ids]\n", "# if self.position_encoding_2d:\n", "position_ids = torch.arange(seq_length, dtype=torch.long, device=device).expand(batch_size, seq_length)\n", "# if not gmask:\n", "# for i, context_length in enumerate(context_lengths):\n", "# position_ids[i, context_length:] = mask_positions[i]\n", "block_position_ids = [torch.cat((\n", " torch.zeros(context_length, dtype=torch.long, device=device),\n", " torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1\n", ")) for context_length in context_lengths]\n", "block_position_ids = torch.stack(block_position_ids, dim=0)\n", "position_ids = torch.stack((position_ids, block_position_ids), dim=1)\n", "# else:\n", "# position_ids = torch.arange(seq_length, dtype=torch.long, device=device).expand(batch_size, seq_length)\n", "# if not gmask:\n", "# for i, context_length in enumerate(context_lengths):\n", "# position_ids[context_length:] = mask_positions[i]" ] }, { "cell_type": "markdown", "id": "b6b29e22", "metadata": {}, "source": [ "## GLM attention mask and position ids Debug" ] }, { "cell_type": "code", "execution_count": null, "id": "b12dda63", "metadata": {}, "outputs": [], "source": [ "encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)\n", "prompt_length = len(encoded_prompt['input_ids'])\n", "label_length = len(tokenizer.tokenize(label)) + 1\n", "if prompt_length + label_length > max_length:\n", " num_tokens_to_remove = prompt_length + label_length - max_length\n", " for _ in 
range(num_tokens_to_remove):\n", " if prompt_length > label_length:\n", " prompt_length -= 1\n", " else:\n", " label_length -= 1\n", "else:\n", " label_length = max_length - prompt_length\n", "assert prompt_length > 0\n", "assert label_length > 0\n", "assert prompt_length + label_length <= max_length\n", "inputs = tokenizer(prompt, prefix + tokenizer.mask_token,\n", " max_length=prompt_length,\n", " truncation=\"only_first\",\n", " return_tensors=\"pt\",\n", " return_attention_mask=True,\n", " return_token_type_ids=False)\n", "inputs_glm = tokenizer.build_inputs_for_generation(inputs, targets=label,\n", " max_gen_length=label_length, padding=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "6521cafd", "metadata": {}, "outputs": [], "source": [ "tokenizer.padding_side = \"left\"\n", "encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)\n", "prompt_length = len(encoded_prompt['input_ids'])\n", "inputs = tokenizer(prompt, prefix + tokenizer.mask_token,\n", " max_length=max_prompt_length,\n", " padding=\"max_length\",\n", " truncation=\"only_first\",\n", " return_tensors=\"pt\",\n", " return_token_type_ids=False)\n", "# max_gen_length = max_length - prompt_length\n", "# max_gen_length = self.args.max_gen_length\n", "assert prompt_length > 0\n", "assert max_gen_length > 0\n", "assert prompt_length + max_gen_length <= max_length\n", "inputs_glm = tokenizer.build_inputs_for_generation(inputs, max_gen_length=max_gen_length, padding=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "d6344ed6", "metadata": {}, "outputs": [], "source": [ "print(inputs)\n", "print(inputs['input_ids'].shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "2fb5447b", "metadata": {}, "outputs": [], "source": [ "print(inputs_glm)\n", "print(inputs_glm['input_ids'].shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "f7edfa03", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": 
"Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: notebook/sft.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "6f526208", "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.insert(0, \"/root/autodl-tmp/Code/RLHF\")\n", "sys.path.insert(0, \"/Users/zeyesun/Documents/Code/RLHF\")\n", "sys.path.insert(0, \"D:\\\\Code\\\\RLHF\")\n", "sys.path.insert(0, \"/mnt/sfevol775196/sunzeye273/Code/chatgpt\")\n", "sys.path.insert(0, \"/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt\")\n", "sys.path.insert(0, \"/mnt/pa002-28359-vol543625-private/Code/chatgpt\")\n", "\n", "import os, time, re, random, glob, json, jieba, copy\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import torch\n", "import loralib as lora\n", "from transformers import (\n", " AutoModelForCausalLM,\n", " AutoModelForSeq2SeqLM,\n", " AutoModelForMultipleChoice,\n", " AutoTokenizer,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " TextGenerationPipeline\n", ")\n", "\n", "from src.models.lora import convert_to_lora_recursively\n", "\n", "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "from sys import platform\n", "if platform == \"linux\" or platform == \"linux2\":\n", " # linux\n", " root = \"/mnt/sfevol775196/sunzeye273/Data\"\n", "# root = \"/mnt/share-pa002-vol682688-prd/sunzeye273/Data\"\n", "# root = \"/mnt/pa002-28359-vol543625-private/Data\"\n", "# root = \"/root/autodl-tmp/Data\"\n", "elif platform == \"darwin\":\n", " # OS X\n", " root = \"/Users/zeyesun/Documents/Data\"\n", "elif platform == 
\"win32\":\n", " # Windows...\n", " root = \"D:\\\\Data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "633e69a5", "metadata": {}, "outputs": [], "source": [ "# model_name = \"pangu-350M\"\n", "# model_name = \"glm-350M-chinese\"\n", "model_name = \"chatglm-6B\"\n", "model_name_or_path = os.path.join(root, \"models\", model_name)\n", "# model_name_or_path = os.path.join(\"/mnt/pa002-28359-vol543625-share/LLM-data/checkpoint\", model_name)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", "print(tokenizer.special_tokens_map)\n", "print(tokenizer.all_special_ids)\n", "print(\n", " f\"unk: {tokenizer.unk_token_id}\\n\",\n", " f\"pad: {tokenizer.pad_token_id}\\n\",\n", " f\"bos: {tokenizer.bos_token_id}\\n\",\n", " f\"eos: {tokenizer.eos_token_id}\\n\",\n", " f\"sep: {tokenizer.sep_token_id}\\n\",\n", " f\"mask: {tokenizer.mask_token_id}\\n\",\n", "# f\"eop: {tokenizer.eop_token_id}\\n\"\n", "# f\"sop: {tokenizer.sop_token_id}\\n\"\n", "# f\"cls: {tokenizer.cls_token_id}\"\n", ") " ] }, { "cell_type": "code", "execution_count": null, "id": "7c62c599", "metadata": {}, "outputs": [], "source": [ "lora_rank = 0\n", "lora_alpha = 1\n", "lora_train_bias = \"none\"" ] }, { "cell_type": "code", "execution_count": null, "id": "b2cae21a", "metadata": {}, "outputs": [], "source": [ "if \"glm\" in model_name_or_path:\n", " model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)\n", " if \"chatglm\" in model_name_or_path:\n", " model = model.half()\n", "else:\n", " model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, use_cache=False)\n", " model.resize_token_embeddings(tokenizer.vocab_size)\n", "if lora_rank > 0:\n", " convert_to_lora_recursively(model, lora_rank, lora_alpha)\n", " lora.mark_only_lora_as_trainable(model, lora_train_bias)\n", "model.to(device)\n", "print(model.device)" ] }, { "cell_type": "markdown", "id": 
"8a28f07a", "metadata": {}, "source": [ "# SFT Prediction" ] }, { "cell_type": "code", "execution_count": null, "id": "b003a1c8", "metadata": {}, "outputs": [], "source": [ "checkpoint_files = os.path.join(root, \"chatgpt\", \"output\", \"sft\", \"pangu-350M\", \"checkpoint-57043\", \"pytorch_model*.bin\")\n", "# checkpoint_files = os.path.join(root, \"chatgpt\", \"output\", \"sft\", \"pangu-2.6B\", \"pangu-2.6B-sft\", \"pytorch_model*.bin\")\n", "checkpoints = glob.glob(checkpoint_files)\n", "st = dict()\n", "for checkpoint in checkpoints:\n", " st.update(torch.load(checkpoint, map_location=\"cpu\"))\n", "model.load_state_dict(st)" ] }, { "cell_type": "code", "execution_count": null, "id": "9330b5aa", "metadata": {}, "outputs": [], "source": [ "max_length = 512\n", "max_length_generation = 50\n", "num_return_sequences = 1\n", "top_p = 0.8\n", "top_k = 50\n", "temperature = 5.0\n", "# prompt = '今天晚上我在睡觉.........他想要做那些事..我就大大声骂他\"不要吵我睡觉\"!!!!!...他就跑出去了...还不接我电话'\n", "# prompt = \"\"\"阅读文章:《战国无双3》()是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴,分别是以武田信玄等人为主的《关东三国志》,织田信长等人为主的《战国三杰》,石田三成等人为主的《关原的年轻武者》,丰富游戏内的剧情。此部份专门介绍角色,欲知武器情报、奥义字或擅长攻击类型等,请至战国无双系列1.由于乡里大辅先生因故去世,不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图(不含村雨城),后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多,部分地图会有兼用的状况,战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主,以下是相关介绍。(注:前方加☆者为猛将传新增关卡及地图。)合并本篇和猛将传的内容,村雨城模式剔除,战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品\\n问:《战国无双3》是由哪两个公司合作开发的?\"\"\"\n", "# prefix = \"答:\"\n", "\n", "prompt = \"以王维的风格,写一首五言诗,题为和宋中丞夏日游福贤观天长寺寺即陈左相宅所施之作\"\n", "prefix = \"\"\n", "\n", "# prompt = \"上联:东风执笔点龙睛,看幸福指数,天天向上\"\n", "# prefix = \"下联:\"\n", "\n", "prefix = \"\"\n", "while True:\n", " if \"chatglm\" in model_name_or_path:\n", " encoded_prompt = tokenizer(prompt)\n", " prompt_length = len(encoded_prompt['input_ids'])\n", " inputs = tokenizer(prompt, max_length=min(prompt_length, max_length), truncation=\"only_first\",\n", " return_tensors=\"pt\")\n", " inputs = inputs.to(device)\n", " outputs = 
model.generate(**inputs,\n", " max_new_tokens=max_length_generation,\n", " eos_token_id=tokenizer.eop_token_id,\n", " pad_token_id=tokenizer.pad_token_id,\n", " do_sample=False,\n", " num_return_sequences=num_return_sequences,\n", " top_p=top_p,\n", " temperature=temperature)\n", " elif \"glm\" in model_name_or_path:\n", " encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)\n", " prompt_length = len(encoded_prompt['input_ids'])\n", " inputs = tokenizer(prompt, prefix + tokenizer.mask_token,\n", "# max_length=max_length - max_length_generation,\n", "# padding=\"max_length\",\n", " max_length=min(prompt_length, max_length),\n", " truncation=\"only_first\",\n", " return_tensors=\"pt\",\n", " return_attention_mask=True,\n", " return_token_type_ids=False)\n", " max_length_generation = max_length - inputs['input_ids'].shape[1]\n", " inputs_glm = tokenizer.build_inputs_for_generation(inputs, \n", " max_gen_length=max_length_generation, padding=True)\n", " inputs_glm = inputs_glm.to(device)\n", " outputs = model.generate(**inputs_glm,\n", " max_new_tokens=max_length_generation,\n", " eos_token_id=tokenizer.eop_token_id,\n", " pad_token_id=tokenizer.pad_token_id,\n", " do_sample=False,\n", " num_return_sequences=num_return_sequences,\n", " top_p=top_p,\n", " temperature=temperature)\n", " else:\n", " inputs = tokenizer(prompt, tokenizer.sep_token + prefix, \n", " max_length=max_length, \n", " return_tensors=\"pt\",\n", " truncation=\"only_first\", \n", "# padding=\"max_length\",\n", " add_special_tokens=False,\n", " return_token_type_ids=False)\n", " inputs = inputs.to(device)\n", " outputs = model.generate(**inputs,\n", " max_new_tokens=max_length_generation,\n", " pad_token_id=tokenizer.pad_token_id,\n", " do_sample=True,\n", " num_return_sequences=num_return_sequences,\n", "# top_p=top_p,\n", " top_k=top_k,\n", " temperature=temperature)\n", " results = tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", "# results = [result.split(prefix, 
maxsplit=1)[1] for result in results]\n", " print(results)\n", " break" ] }, { "cell_type": "code", "execution_count": null, "id": "9dbb3482", "metadata": {}, "outputs": [], "source": [ "# f = os.path.join(root, \"raw\", \"baike_qa_train.json\")\n", "# [baike_qa.jsonl, chinese_classical.jsonl, chinese_poetry.jsonl, couplets.jsonl, weibo_summary_comments.jsonl, zhidao.jsonl]\n", "f = os.path.join(root, \"chatgpt\", \"processed\", \"baike_qa.jsonl\")\n", "i = 0\n", "prompts = []\n", "prompts_processed = []\n", "labels = []\n", "with open(f, \"r\", encoding=\"utf-8\") as r:\n", " while True:\n", " line = r.readline()\n", " if not line:\n", " break\n", " item = json.loads(line.strip(\"\\n\"))\n", " # prompt = clean_text(item['title'] if len(item['title']) > len(item['desc']) else item['desc'])\n", " # prompt_processed = prompt + tokenizer.sep_token + prefix\n", " # label = clean_text(item['answer'])\n", " prompt = item['prompt']\n", " prompt_processed = prompt\n", " label = item['answers'][0]['answer']\n", " prompts.append(prompt)\n", " prompts_processed.append(prompt_processed)\n", " labels.append(label)\n", " i += 1\n", " # if i > 1000:\n", " # break\n", "random.shuffle(prompts_processed)\n", "print(len(prompts_processed))" ] }, { "cell_type": "code", "execution_count": null, "id": "0ad64148", "metadata": {}, "outputs": [], "source": [ "i = 79\n", "num_return_sequences = 2\n", "max_length = 512\n", "max_length_generation = 100\n", "top_k = 50\n", "top_p = 0.8\n", "temperature = 1.0\n", "t1 = time.time()\n", "prompt = prompts_processed[i]\n", "inputs = tokenizer(prompt, add_special_tokens=False, return_token_type_ids=False, return_tensors=\"pt\")\n", "inputs = inputs.to(device)\n", "outputs = model.generate(**inputs,\n", " max_new_tokens=max_length_generation,\n", " pad_token_id=tokenizer.pad_token_id,\n", " do_sample=True,\n", " num_return_sequences=num_return_sequences,\n", " # top_p=top_p,\n", " top_k=top_k,\n", " temperature=temperature)\n", "results = 
tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", "for result in results:\n", " # result.split(prefix, maxsplit=1)[1]\n", " print(result)\n", "\n", "\n", "# results = text_generator(prompts_processed[i:j], max_length=200, num_return_sequences=num_return_sequences,\n", "# do_sample=True, top_k=50, temperature=10.0)\n", "# print(f\"Finished prediction, time taken: {time.time()-t1}\")\n", "\n", "# for prompt, res, label in zip(prompts[i:j], results[:(j-i)], labels[i:j]):\n", "# print(f\"prompt: {prompt}\\nlabel: {label}\")\n", "# for k in range(num_return_sequences):\n", "# model_answer = res[k]['generated_text'].split(prefix)[1].replace(\"\", \"\").replace(\"\", \"\")\n", "# print(f\"model answer-{k}: {model_answer}\")\n", "# print(\"\\n\\n\")" ] }, { "cell_type": "markdown", "id": "da1e08ce", "metadata": {}, "source": [ "# SFT Training" ] }, { "cell_type": "code", "execution_count": null, "id": "9eef5821", "metadata": {}, "outputs": [], "source": [ "# prompt = \"你是谁\"\n", "# prefix = \"答:\"\n", "# label = \"我是***,很高兴为你服务\"\n", "prompt = \"\"\"倍数金额: 1倍,¥1024 元 场次 主---------客队 投注选项- 参考赔率---投注选项胜 平 负---\n", "第一次为基本面投注---第二次为通过处理后投注 1 伯明翰 VS -----维冈 31-----1.93 3.27 3.87 --- 2伯恩利VS---朴茨茅30----- ---3 博尔顿 VS -----狼队\n", "3------1.94 3.25 3.88 --- 4 斯托克 VS ---阿森纳 0------5.03 3.47 1.68 --- 5 门兴 VS -----弗赖堡 31-----1.77 3.倍数金额: 1倍,¥1024 元 场次\n", "主---------客队 投注选项- 参考赔率---投注选项胜 平 负--- 第一次为基本面投注---第二次为通过处理后投注 1 伯明翰 VS -----维冈 31-----1.93 3.27 3.87 ---\n", "2伯恩利VS---朴茨茅30----- ---3 博尔顿 VS -----狼队 3------1.94 3.25 3.88 --- 4 斯托克 VS ---阿森纳 0------5.03 3.47 1.68 --- 5 门兴 VS -----弗赖堡\n", "31-----1.77 3.39 4.43 --- 6 美因兹 VS ---不来梅 10-----3.76 3.34 1.92 --- 7波鸿VS-----纽伦堡30----- ---8 斯图加 VS ---法兰克 31-----1.59 3.62 5.47\n", "--- 9 赫塔 VS -----霍芬海 30-----2.49 3.19 2.69 --- 10 勒沃 VS ------科隆 3------1.35 4.44 8.31 --- 11卡塔尼VS----巴里31----- ---12 拉齐奥 VS\n", "--佛罗伦 31-----2.35 3.05 3.01 --- 13 特内里 VS ----皇马 0------9.43 4.95 1.29 --- 14 巴萨 VS ----马拉加 3------1.15 6.78 15.49 
--\"\"\"\n", "prefix = \"回答:\"\n", "label = \"你出的赔率数据太早了,数据随时都会变化,这就是所谓要看临盘的道理,目前的数据没什么参考价值。\"\n", "max_length = 512\n", "encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)\n", "prompt_length = len(encoded_prompt['input_ids'])\n", "label_length = len(tokenizer.tokenize(label)) + (1 if \"chatglm\" not in model_name_or_path else 0)\n", "# print(f\"prompt length: {prompt_length}, label length: {label_length}\")\n", "if prompt_length + label_length > max_length:\n", " num_tokens_to_remove = prompt_length + label_length - max_length\n", " for _ in range(num_tokens_to_remove):\n", " if prompt_length > label_length:\n", " prompt_length -= 1\n", " else:\n", " label_length -= 1\n", "else:\n", " label_length = max_length - prompt_length\n", "assert prompt_length > 0\n", "assert label_length > 0\n", "assert prompt_length + label_length <= max_length\n", "encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token,\n", " max_length=prompt_length, truncation=\"only_first\",\n", " return_tensors=\"pt\", return_attention_mask=True)\n", "inputs = tokenizer.build_inputs_for_generation(encoded_dict, targets=label,\n", " max_gen_length=label_length, padding=True)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5d8b9e45", "metadata": {}, "outputs": [], "source": [ "print(tokenizer.special_tokens_map)\n", "print(tokenizer.all_special_ids)\n", "print(\n", " f\"unk: {tokenizer.unk_token_id}\\n\",\n", " f\"pad: {tokenizer.pad_token_id}\\n\",\n", " f\"bos: {tokenizer.bos_token_id}\\n\",\n", " f\"eos: {tokenizer.eos_token_id}\\n\",\n", " f\"sep: {tokenizer.sep_token_id}\\n\",\n", " f\"mask: {tokenizer.mask_token_id}\\n\",\n", "# f\"eop: {tokenizer.eop_token_id}\\n\"\n", "# f\"sop: {tokenizer.sop_token_id}\\n\"\n", "# f\"cls: {tokenizer.cls_token_id}\"\n", ") " ] }, { "cell_type": "code", "execution_count": null, "id": "f99c82e8", "metadata": {}, "outputs": [], "source": [ "print(tokenizer.convert_ids_to_tokens([20006]))\n", 
"print(tokenizer.convert_ids_to_tokens([20012]))" ] }, { "cell_type": "code", "execution_count": null, "id": "24d349a5", "metadata": {}, "outputs": [], "source": [ "print(encoded_prompt)\n", "print(tokenizer.decode(encoded_prompt['input_ids']))\n", "print(encoded_dict)\n", "print(tokenizer.batch_decode(encoded_dict['input_ids']))" ] }, { "cell_type": "code", "execution_count": null, "id": "9a63f417", "metadata": {}, "outputs": [], "source": [ "for key, val in inputs.items():\n", " print(f\"{key} shape: {val.shape}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7f78df96", "metadata": {}, "outputs": [], "source": [ "for key, val in inputs_glm.items():\n", " print(f\"{key} shape: {val.shape}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4261c76d", "metadata": {}, "outputs": [], "source": [ "print(inputs_glm['input_ids'][:20])" ] }, { "cell_type": "code", "execution_count": null, "id": "6667eaac", "metadata": {}, "outputs": [], "source": [ "print(inputs_glm['labels'][:20])" ] }, { "cell_type": "code", "execution_count": null, "id": "ac236f19", "metadata": {}, "outputs": [], "source": [ "print(inputs_glm['attention_mask'][0][9])" ] }, { "cell_type": "code", "execution_count": null, "id": "bda9e8ba", "metadata": {}, "outputs": [], "source": [ "print(inputs_glm['position_ids'][0][:20])\n", "print(inputs_glm['position_ids'][1][:20])" ] }, { "cell_type": "code", "execution_count": null, "id": "83126e24", "metadata": {}, "outputs": [], "source": [ "# st = model.state_dict()\n", "st.keys()" ] }, { "cell_type": "code", "execution_count": null, "id": "7320e1ca", "metadata": {}, "outputs": [], "source": [ "print(st['transformer.word_embeddings.weight'].dtype)\n", "print(st['transformer.layers.0.input_layernorm.weight'].dtype)" ] }, { "cell_type": "code", "execution_count": null, "id": "81289f7f", "metadata": {}, "outputs": [], "source": [ "dtypes = dict()\n", "for key, val in st.items():\n", " if val.dtype not in dtypes:\n", " dtypes[val.dtype] 
= list()\n", " dtypes[val.dtype].append(key)\n", "print(dtypes.keys())\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3f4e3327", "metadata": {}, "outputs": [], "source": [ "output = model(**inputs)" ] }, { "cell_type": "code", "execution_count": null, "id": "fcc223bf", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: notebook/test.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "93c83ce9", "metadata": {}, "outputs": [], "source": [ "import os, time, re, random, glob, json, jieba, copy\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import torch\n", "from transformers import (\n", " AutoConfig,\n", " AutoModelForCausalLM,\n", " AutoModelForSeq2SeqLM,\n", " AutoModelForMultipleChoice,\n", " AutoTokenizer,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator\n", ")\n", "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "from sys import platform\n", "if platform == \"linux\" or platform == \"linux2\":\n", " # linux\n", " root = \"/mnt/sfevol775196/sunzeye273/Data\"\n", "# root = \"/mnt/share-pa002-vol682688-prd/sunzeye273/Data\"\n", "# root = \"/mnt/pa002-28359-vol543625-private/Data\"\n", "# root = \"/root/autodl-tmp/Data\"\n", "elif platform == \"darwin\":\n", " # OS X\n", " root = \"/Users/zeyesun/Documents/Data\"\n", "elif platform == \"win32\":\n", " # Windows...\n", " root = \"D:\\\\Data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "3d6fdca2", "metadata": {}, 
"outputs": [], "source": [ "model_name = \"llama-7b\"\n", "# model_name = \"glm-350M-chinese\"\n", "# model_name = \"chatglm-6B\"\n", "model_name_or_path = os.path.join(root, \"models\", model_name)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", "print(tokenizer.special_tokens_map)\n", "print(tokenizer.all_special_ids)\n", "print(\n", " f\"unk: {tokenizer.unk_token_id}\\n\",\n", " f\"pad: {tokenizer.pad_token_id}\\n\",\n", " f\"bos: {tokenizer.bos_token_id}\\n\",\n", " f\"eos: {tokenizer.eos_token_id}\\n\",\n", " f\"sep: {tokenizer.sep_token_id}\\n\",\n", " f\"mask: {tokenizer.mask_token_id}\\n\",\n", "# f\"eop: {tokenizer.eop_token_id}\\n\"\n", "# f\"sop: {tokenizer.sop_token_id}\\n\"\n", " f\"cls: {tokenizer.cls_token_id}\"\n", ") " ] }, { "cell_type": "code", "execution_count": 2, "id": "52c6acb9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2916.6666666666665" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "35000/ (3*4)" ] }, { "cell_type": "markdown", "id": "df9a2572", "metadata": {}, "source": [ "# LLaMa" ] }, { "cell_type": "code", "execution_count": null, "id": "6ccd5025", "metadata": {}, "outputs": [], "source": [ "model_name = \"llama-small\"\n", "model_name_or_path = os.path.join(root, \"models\", model_name)\n", "# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)\n", "config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)\n", "model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "d76f86cd", "metadata": {}, "outputs": [], "source": [ "f = os.path.join(model_name_or_path, \"pytorch_model.bin\")\n", "torch.save(model.state_dict(), f)" ] }, { "cell_type": "code", "execution_count": null, "id": "5405693b", "metadata": {}, "outputs": [], "source": [ "prompt = \"你好 你是谁 你好 
你是谁\"\n", "eos_ids = [0, 5, 12, 16, 22]\n", "max_length = 512\n", "input_embeds = torch.rand(1, dtype=torch.float16)\n", "# tokenizer.tokenize(prompt, add_special_tokens=False)\n", "inputs = tokenizer(prompt, max_length=max_length, return_tensors=\"pt\", truncation=\"longest_first\", \n", " return_attention_mask=True, return_token_type_ids=False)\n", "# print(inputs)\n", "tgt_len = len(inputs['input_ids'][0])\n", "print(f\"tgt_len: {tgt_len}\")\n", "combined_attention_mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(input_embeds.dtype).min))\n", "# print(f\"shape: {combined_attention_mask.shape}, combined_attention_mask: {combined_attention_mask}\")\n", "for i in range(len(eos_ids)-1):\n", " attention_mask = torch.ones((1, eos_ids[i+1]-eos_ids[i]), dtype=torch.long)\n", "# print(f\"{i}, shape: {attention_mask.shape}, attention_mask: {attention_mask}\")\n", " attention_mask = _prepare_decoder_attention_mask(attention_mask, attention_mask.shape, \n", " input_embeds, 0)\n", " combined_attention_mask[eos_ids[i]:eos_ids[i+1], eos_ids[i]:eos_ids[i+1]] = attention_mask\n", "# print(f\"{i}-th, shape: {attention_mask.shape}, attention_mask: {attention_mask}\")\n", "# break\n", "# print(f\"shape: {combined_attention_mask.shape}, combined_attention_mask: {combined_attention_mask}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "317fe60a", "metadata": {}, "outputs": [], "source": [ "def _make_causal_mask(\n", " input_ids_shape, dtype, device, past_key_values_length = 0\n", "):\n", " \"\"\"\n", " Make causal mask used for bi-directional self-attention.\n", " \"\"\"\n", " bsz, tgt_len = input_ids_shape\n", " mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)\n", " mask_cond = torch.arange(mask.size(-1), device=device)\n", " mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)\n", " mask = mask.to(dtype)\n", "\n", " if past_key_values_length > 0:\n", " mask = 
torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)\n", " return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)\n", "\n", "\n", "# Copied from transformers.models.bart.modeling_bart._expand_mask\n", "def _expand_mask(mask, dtype, tgt_len):\n", " \"\"\"\n", " Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.\n", " \"\"\"\n", " bsz, src_len = mask.size()\n", " tgt_len = tgt_len if tgt_len is not None else src_len\n", "\n", " expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)\n", "\n", " inverted_mask = 1.0 - expanded_mask\n", "\n", " return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)\n", "\n", "def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length):\n", " # create causal mask\n", " # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]\n", " combined_attention_mask = None\n", " if input_shape[-1] > 1:\n", " combined_attention_mask = _make_causal_mask(\n", " input_shape,\n", " inputs_embeds.dtype,\n", " device=inputs_embeds.device,\n", " past_key_values_length=past_key_values_length,\n", " )\n", "\n", " if attention_mask is not None:\n", " # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]\n", " expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(\n", " inputs_embeds.device\n", " )\n", " combined_attention_mask = (\n", " expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask\n", " )\n", "\n", " return combined_attention_mask" ] }, { "cell_type": "markdown", "id": "f589231f", "metadata": {}, "source": [ "# GLM" ] }, { "cell_type": "code", "execution_count": null, "id": "a7268832", "metadata": {}, "outputs": [], "source": [ "prompt = \"你好\"\n", "prefix = \"答:\"\n", "max_length = 128\n", "encoded_prompt = tokenizer(prompt, prefix + 
tokenizer.mask_token)\n", "prompt_length = len(encoded_prompt['input_ids'])\n", "encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token,\n", " max_length=min(prompt_length, max_length),\n", " truncation=\"only_first\",\n", " return_tensors=\"pt\",\n", " return_token_type_ids=False)\n", "max_gen_length = max_length - encoded_dict['input_ids'].shape[1]\n", "inputs = tokenizer.build_inputs_for_generation(encoded_dict, max_gen_length=max_gen_length, padding=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "a1fc75f2", "metadata": {}, "outputs": [], "source": [ "inputs" ] }, { "cell_type": "code", "execution_count": null, "id": "bfd9e66a", "metadata": {}, "outputs": [], "source": [ "batch_size, seq_length = encoded_dict['input_ids'].shape[:2]\n", "attention_mask1 = encoded_dict['attention_mask']\n", "attention_mask2 = attention_mask1.unsqueeze(1).expand(-1, seq_length + max_gen_length, -1)\n", "generation_attention_mask = torch.cat([attention_mask2.new_zeros((seq_length, max_gen_length)),\n", " torch.tril(attention_mask2.new_ones((max_gen_length, max_gen_length)))],\n", " dim=0).unsqueeze(0).expand(batch_size, -1, -1)\n", "attention_mask3 = torch.cat((attention_mask2, generation_attention_mask), dim=2)\n", "attention_mask4 = attention_mask3.unsqueeze(1)" ] }, { "cell_type": "code", "execution_count": null, "id": "5b629712", "metadata": {}, "outputs": [], "source": [ "print(attention_mask1.shape)\n", "print(attention_mask1)" ] }, { "cell_type": "code", "execution_count": null, "id": "6fd9341e", "metadata": {}, "outputs": [], "source": [ "print(attention_mask2.shape)\n", "# print(attention_mask2)" ] }, { "cell_type": "code", "execution_count": null, "id": "c9f75428", "metadata": {}, "outputs": [], "source": [ "print(attention_mask3.shape)\n", "print(attention_mask3)" ] }, { "cell_type": "code", "execution_count": null, "id": "4b600887", "metadata": {}, "outputs": [], "source": [ "print(attention_mask4.shape)\n", "print(attention_mask4)" ] }, { 
"cell_type": "markdown", "id": "7e028572", "metadata": {}, "source": [ "# ChatGLM" ] }, { "cell_type": "code", "execution_count": null, "id": "c25ba334", "metadata": {}, "outputs": [], "source": [ "model_name_or_path = os.path.join(root, \"models\", \"chatglm-6B\")\n", "# model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)\n", "model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)\n", "# model = model.half().to(device)" ] }, { "cell_type": "code", "execution_count": null, "id": "cc68c1cc", "metadata": {}, "outputs": [], "source": [ "texts = [\n", " \"你好\",\n", " \"晚上睡不着应该怎么办\"\n", "]\n", "history = []\n", "for text in texts:\n", " response, history = model.chat(tokenizer, text, history=history)\n", " print(f\"问: {text}\\n答:{response}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b4ccfa2b", "metadata": {}, "outputs": [], "source": [ "max_length = 512\n", "prompt = \"晚上睡不着应该怎么办\"\n", "label = \"吃褪黑素\"\n", "encoded_dict = tokenizer(prompt, label,\n", " max_length=max_length, return_tensors=\"pt\",\n", " truncation=\"longest_first\", padding=\"max_length\", return_token_type_ids=False)\n", "print(encoded_dict.keys())\n", "print(encoded_dict['input_ids'].shape)\n", "print(encoded_dict['input_ids'][0, :50])" ] }, { "cell_type": "code", "execution_count": null, "id": "50baebfc", "metadata": {}, "outputs": [], "source": [ "ids = [ 94747, 103400, 84213, 86846, 150001, 150004, 20005, 84020, 110857,\n", " 84248, 84886, 20003]\n", "print([tokenizer._convert_id_to_token(i) for i in ids])" ] }, { "cell_type": "code", "execution_count": null, "id": "f9104b06", "metadata": {}, "outputs": [], "source": [ "input_ids = encoded_dict['input_ids']\n", "print(input_ids[0, :20])\n", "seqs = input_ids.tolist()\n", "mask_positions = [seq.index(tokenizer.gmask_token_id) for seq in seqs]\n", "gmask = True" ] }, { "cell_type": "code", "execution_count": null, "id": "b7b577cc", "metadata": {}, "outputs": [], 
"source": [ "batch_size, seq_length = input_ids.shape\n", "context_lengths = [seq.tolist().index(tokenizer.bos_token_id) for seq in input_ids]\n", "attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)\n", "attention_mask.tril_()\n", "for i, context_length in enumerate(context_lengths):\n", " attention_mask[i, :, :context_length] = 1\n", "attention_mask.unsqueeze_(1)\n", "# attention_mask = (attention_mask < 0.5).bool()\n", "print(attention_mask.shape)\n", "print(attention_mask[0, :20, :20])\n", "batch_size, seq_length = input_ids.shape\n", "context_lengths = [seq.tolist().index(tokenizer.bos_token_id) for seq in input_ids]\n", "# if self.position_encoding_2d:\n", "position_ids = torch.arange(seq_length, dtype=torch.long, device=device).expand(batch_size, seq_length)\n", "if not gmask:\n", " for i, context_length in enumerate(context_lengths):\n", " position_ids[i, context_length:] = mask_positions[i]\n", "block_position_ids = [torch.cat((\n", " torch.zeros(context_length, dtype=torch.long, device=device),\n", " torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1\n", ")) for context_length in context_lengths]\n", "block_position_ids = torch.stack(block_position_ids, dim=0)\n", "position_ids = torch.stack((position_ids, block_position_ids), dim=1)\n", "# else:\n", "# position_ids = torch.arange(seq_length, dtype=torch.long, device=device).expand(batch_size, seq_length)\n", "# if not gmask:\n", "# for i, context_length in enumerate(context_lengths):\n", "# position_ids[context_length:] = mask_positions[i]\n", "print(position_ids.shape)\n", "print(position_ids[0, :, :20])" ] }, { "cell_type": "code", "execution_count": null, "id": "d91e23d9", "metadata": {}, "outputs": [], "source": [ "print(tokenizer.tokenize(prompt))\n", "print(tokenizer(prompt))" ] }, { "cell_type": "code", "execution_count": null, "id": "e5d10f8c", "metadata": {}, "outputs": [], "source": [ "encoded_prompt = tokenizer(prompt)\n", "prompt_length 
= len(encoded_prompt['input_ids'])\n", "inputs = tokenizer(prompt, max_length=min(prompt_length, max_length), truncation=\"only_first\",\n", " return_tensors=\"pt\")\n", "inputs = inputs.to(device)" ] }, { "cell_type": "code", "execution_count": null, "id": "d6cd0dfa", "metadata": {}, "outputs": [], "source": [ "max_length_generation = 50\n", "num_return_sequences = 1\n", "top_p = 0.8\n", "temperature = 1.0\n", "outputs = model.generate(**inputs, max_new_tokens=max_length_generation,\n", " eos_token_id=tokenizer.eop_token_id,\n", " pad_token_id=tokenizer.pad_token_id,\n", " do_sample=False,\n", " num_return_sequences=num_return_sequences,\n", " top_p=top_p,\n", " temperature=temperature)" ] }, { "cell_type": "markdown", "id": "6f13279f", "metadata": {}, "source": [ "# DeepSpeed" ] }, { "cell_type": "markdown", "id": "7798b57b", "metadata": {}, "source": [ "### estimate the memory needs for params, optim states and gradients" ] }, { "cell_type": "code", "execution_count": null, "id": "07493af2", "metadata": {}, "outputs": [], "source": [ "# stage 1 and 2\n", "from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live\n", "estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=3, num_nodes=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "4cb3b908", "metadata": {}, "outputs": [], "source": [ "# stage 3\n", "from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live\n", "estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=3, num_nodes=1)" ] }, { "cell_type": "markdown", "id": "e9aa232d", "metadata": {}, "source": [ "# Split torch checkpoint into multiple checkpoints" ] }, { "cell_type": "code", "execution_count": null, "id": "705292a3", "metadata": {}, "outputs": [], "source": [ "checkpoint = os.path.join(root, \"chatgpt\", \"output\", \"sft\", \"pangu-2.6B\", \"checkpoint-42782\")\n", "if \"glm\" in model_name_or_path:\n", " model = 
AutoModelForSeq2SeqLM.from_pretrained(checkpoint, trust_remote_code=True)\n", "else:\n", " model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True, use_cache=False)\n", "st = model.state_dict()" ] }, { "cell_type": "code", "execution_count": null, "id": "e6d5ee32", "metadata": {}, "outputs": [], "source": [ "import sys\n", "print(sys.getsizeof(st))\n", "keys = list(st.keys())\n", "n = 10\n", "m = {\"metadata\": {\"total_size\":sys.getsizeof(st)}, \"weight_map\":dict()}\n", "span = len(keys) // n\n", "for i in range(n):\n", " fn = f\"pytorch_model-{i+1}-of-{n}.bin\"\n", " f = os.path.join(checkpoint, fn)\n", " stt = dict()\n", " for key in keys[i*span:(i+1)*span]:\n", " stt[key] = st[key]\n", " m[\"weight_map\"][key] = fn\n", " torch.save(stt, f)\n", "f = os.path.join(checkpoint, \"pytorch_model.bin.index.json\")\n", "json.dump(m, open(f, \"w\", encoding=\"utf-8\"), ensure_ascii=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: requirements.txt ================================================ jieba sentencepiece nltk>=3.8.1 rouge-score>=0.1.2 evaluate>=0.4.0 numpy==1.23.2 tensorflow>=2.5.0 transformers>=4.31.0 tokenizers>=0.13.3 #triton==1.0.0 triton ninja torchmetrics opencc torchtyping accelerate>=0.19.0 ray tqdm rich icetk cpm_kernels protobuf==3.20.0 pynvml bitsandbytes>=0.39.0 peft>=0.4.0 transformers_stream_generator trl trlx; trlx ================================================ FILE: src/__init__.py ================================================ ================================================ FILE: src/data/__init__.py 
def chatglm3_encode(tokenizer: PreTrainedTokenizerBase,
                    query: str,
                    label: str = None,
                    system: str = "",
                    max_length: int = 1024,
                    is_prefix: bool = True
                    ) -> Tuple[List[int], List[int], List[int]]:
    '''Encode system/query/label into a fixed-length ChatGLM3 SFT sample.

    Reproduces the "longest_first" truncation strategy: when the assembled
    sequence exceeds ``max_length``, tokens are removed first from the longer
    of (query+system) vs label; when it is shorter, the label side is
    right-padded with ``pad_token_id`` so the result is exactly ``max_length``.

    :param tokenizer: ChatGLM3 tokenizer; must provide ``get_prefix_tokens``,
        ``get_command`` and ``encode``
    :param query: user query text
    :param label: gold answer text; when ``None`` only the prompt side is built
        (no end token, no label ids)
    :param system: optional system text, placed before the query when
        ``is_prefix`` is True, otherwise after it
    :param max_length: total length of the returned ``input_ids``/``labels``
    :param is_prefix: whether ``system`` precedes the query
    :return: ``(input_ids, labels, prompt_ids)``; ``input_ids`` and ``labels``
        are both exactly ``max_length`` long, ``labels`` masks the prompt
        region with ``pad_token_id``
    :raises ValueError: if ``max_length`` is too small to hold even the fixed
        special tokens (previously this surfaced as an opaque IndexError)
    '''
    prefix_tokens = tokenizer.get_prefix_tokens()
    # "<|user|>" role marker followed by a newline token.
    role_tokens_1 = [tokenizer.get_command("<|user|>")] + tokenizer.encode("\n", add_special_tokens=False)
    # Process `system` and `query`. The leading-space + [1:] trick drops the
    # artificial first piece the sentencepiece tokenizer emits at text start.
    if is_prefix:
        system_ids = tokenizer.encode(system + "\n\n", add_special_tokens=False) if len(system) > 0 else []
        query_ids = tokenizer.encode(" " + query, add_special_tokens=False)[1:]
    else:
        system_ids = tokenizer.encode(" \n\n" + system, add_special_tokens=False)[1:] if len(system) > 0 else []
        query_ids = tokenizer.encode(query, add_special_tokens=False)
    role_tokens_2 = [tokenizer.get_command("<|assistant|>")]
    # Process `label`.
    if label is not None:
        label_ids = tokenizer.encode(label, add_special_tokens=False)
        # NOTE(review): the command name here is an empty string in the
        # original source -- likely an end-of-answer token whose name was
        # lost; confirm against the ChatGLM3 special-token table.
        end_tokens = [tokenizer.get_command("")]
    else:
        label_ids = []
        end_tokens = []
    num_tokens_to_remove = len(prefix_tokens) + len(role_tokens_1) + len(query_ids) + len(system_ids) + \
                           len(role_tokens_2) + len(label_ids) + len(end_tokens) - max_length
    if num_tokens_to_remove > 0:
        # Remove overflowing tokens, longest side first; system tokens are
        # sacrificed only as a last resort.
        for _ in range(num_tokens_to_remove):
            if len(query_ids) + len(system_ids) > len(label_ids) and len(query_ids) > 0:
                query_ids.pop()
            elif len(label_ids) > 0:
                label_ids.pop()
            elif len(system_ids) > 0:
                logger.warning("removing system tokens due to tokens overflowing")
                system_ids.pop()
            else:
                raise ValueError(
                    f"max_length={max_length} is too small to hold the ChatGLM3 special tokens"
                )
    # end_tokens is [] when label is None, so this is safe unconditionally.
    label_ids += end_tokens
    if num_tokens_to_remove < 0:
        # Right-pad the label side so the total is exactly max_length.
        label_ids += [tokenizer.pad_token_id] * -num_tokens_to_remove
    if is_prefix:
        prompt_ids = prefix_tokens + role_tokens_1 + system_ids + query_ids + role_tokens_2
    else:
        prompt_ids = prefix_tokens + role_tokens_1 + query_ids + system_ids + role_tokens_2
    input_ids = prompt_ids + label_ids
    # Prompt positions are masked out of the loss with pad_token_id.
    labels = [tokenizer.pad_token_id] * len(prompt_ids) + label_ids
    assert len(input_ids) == len(labels) == max_length
    return input_ids, labels, prompt_ids


def chatglm2_encode(tokenizer: PreTrainedTokenizerBase,
                    query: str,
                    label: str = None,
                    system: str = "",
                    max_length: int = 1024,
                    is_prefix: bool = True
                    ) -> Tuple[List[int], List[int], List[int]]:
    '''Encode system/query/label into a fixed-length ChatGLM2 SFT sample.

    Same contract as :func:`chatglm3_encode` but using ChatGLM2's chat
    template ("[Round 1]\\n\\n问:...\\n\\n答:") and its [gMASK]/sop/eop
    special tokens.

    :param tokenizer: ChatGLM2 tokenizer; must provide ``get_command`` and
        ``encode``
    :param query: user query text
    :param label: gold answer text; when ``None`` only the prompt side is built
    :param system: optional system text, placed before the query when
        ``is_prefix`` is True, otherwise after it
    :param max_length: total length of the returned ``input_ids``/``labels``
    :param is_prefix: whether ``system`` precedes the query
    :return: ``(input_ids, labels, prompt_ids)``; ``input_ids`` and ``labels``
        are both exactly ``max_length`` long
    :raises ValueError: if ``max_length`` cannot hold the fixed template tokens
    '''
    gmask_id = tokenizer.get_command("[gMASK]")
    sop_id = tokenizer.get_command("sop")
    eop_id = tokenizer.get_command("eop")
    # Hard-coded token ids of the ChatGLM2 chat template: "[Round {1}]\n\n问:"
    ids1 = [790, 30951, 517, 30910, 30939, 30996, 13, 13, 54761, 31211]
    # "\n\n答:"
    ids2 = [13, 13, 55437, 31211]
    # Leading-space + [1:] drops the artificial first sentencepiece token.
    if len(system) > 0:
        if is_prefix:
            system_ids = tokenizer.encode(" " + system + "\n\n", add_special_tokens=False)[1:]
        else:
            system_ids = tokenizer.encode(" \n\n" + system, add_special_tokens=False)[1:]
    else:
        system_ids = []
    query_ids = tokenizer.encode(" " + query, add_special_tokens=False)[1:]
    if label is not None:
        label_ids = tokenizer.encode(label, add_special_tokens=False)
        end_tokens = [eop_id]
    else:
        label_ids = []
        end_tokens = []
    # [gMASK] + sop always; eop only when a label is present.
    num_special_tokens = 2 + len(end_tokens)
    num_tokens_to_remove = len(ids1) + len(query_ids) + len(system_ids) + len(ids2) + \
                           len(label_ids) + num_special_tokens - max_length
    if num_tokens_to_remove > 0:
        # Remove overflowing tokens, longest side first; system tokens last.
        for _ in range(num_tokens_to_remove):
            if len(query_ids) + len(system_ids) > len(label_ids) and len(query_ids) > 0:
                query_ids.pop()
            elif len(label_ids) > 0:
                label_ids.pop()
            elif len(system_ids) > 0:
                logger.warning("removing system tokens due to tokens overflowing")
                system_ids.pop()
            else:
                raise ValueError(
                    f"max_length={max_length} is too small to hold the ChatGLM2 template tokens"
                )
    # end_tokens is [] when label is None, so this is safe unconditionally.
    label_ids += end_tokens
    if num_tokens_to_remove < 0:
        # Right-pad the label side so the total is exactly max_length.
        label_ids += [tokenizer.pad_token_id] * -num_tokens_to_remove
    if is_prefix:
        prompt_ids = [gmask_id, sop_id] + ids1 + system_ids + query_ids + ids2
    else:
        prompt_ids = [gmask_id, sop_id] + ids1 + query_ids + system_ids + ids2
    input_ids = prompt_ids + label_ids
    # Prompt positions are masked out of the loss with pad_token_id.
    labels = [tokenizer.pad_token_id] * len(prompt_ids) + label_ids
    assert len(input_ids) == len(labels) == max_length
    return input_ids, labels, prompt_ids


class DataCollatorReward:
    """Collates reward-model samples into a batch.

    Each sample is a dict with ``input_ids`` (and optionally
    ``attention_mask``) tensors of identical length; they are stacked along a
    new batch dimension. When the samples carry no attention mask,
    ``chosen_attention_mask`` is ``None``.
    """

    def __call__(self, data):
        has_attention_mask = 'attention_mask' in data[0]
        batch = {
            "chosen_input_ids": torch.stack([f['input_ids'] for f in data]),
            "chosen_attention_mask": torch.stack([f['attention_mask'] for f in data]) if has_attention_mask else None,
        }
        return batch


class DataCollatorRLHF:
    """Collates RLHF prompts into a left-padded, fixed-length batch.

    Each sample is a tuple ``(prompt_ids, prompt_mask, pad_token_id)``.
    Prompts are first right-padded to the longest sample, extended to
    ``max_token_len``, then flipped so the padding ends up on the LEFT --
    the layout generation code expects.
    """

    def __init__(self, max_token_len, inference_tp_size):
        self.max_token_len = max_token_len
        self.inference_tp_size = inference_tp_size

    def __call__(self, data):
        batch = {}
        # All samples share the same pad id; read it off the last tuple.
        pad_token_id = data[-1][-1]

        prompt = pad_sequence([f[0] for f in data],
                              padding_value=pad_token_id,
                              batch_first=True)
        prompt_mask = pad_sequence([f[1] for f in data],
                                   padding_value=0,
                                   batch_first=True)

        # Make sure the final output has the fixed length self.max_token_len.
        length = prompt.size()[-1]
        pad_length = self.max_token_len - length
        if pad_length > 0:
            batch["prompt"] = F.pad(prompt,
                                    pad=(pad_length, 0),
                                    mode='constant',
                                    value=pad_token_id)
            batch["prompt_att_mask"] = F.pad(prompt_mask,
                                             pad=(pad_length, 0),
                                             mode='constant',
                                             value=0)
        else:
            batch["prompt"] = prompt
            batch["prompt_att_mask"] = prompt_mask
        # Flip so padding sits on the left (generation continues on the right).
        batch["prompt"] = batch["prompt"].flip(1)
        batch["prompt_att_mask"] = batch["prompt_att_mask"].flip(1)
        return batch
class PretrainDataset(Dataset):
    """Dataset for causal-LM pretraining from a JSONL file of prompt/label records.

    Two modes:
      * ``concat_samples=False``: each JSONL record becomes one sample,
        tokenized per model family (GLM, ChatGLM/ChatGLM2, vicuna, pangu, ...).
      * ``concat_samples=True`` (default): records are concatenated into
        fixed-length token buffers of ``args.max_length``; sample boundaries
        are remembered in ``eos_ids`` and a block-diagonal attention mask is
        built per item so samples do not attend to each other.
    """

    def __init__(self, args, filename, tokenizer, concat_samples=True):
        self.args = args
        self.tokenizer = tokenizer
        self.concat_samples = concat_samples
        # SFT-style configs expose model_name_or_path; RLHF configs expose
        # actor_model_path instead — accept either.
        self.model_name_or_path = args.model_name_or_path if hasattr(args, "model_name_or_path") else args.actor_model_path
        self.post_list = self.load_dataset(filename)
        # Log the first few samples for sanity checking (rank 0 only).
        for k in range(5):
            print_rank_0(f"PretrainDataset sample-{k}\n: {self.post_list[k]}")

    def __len__(self):
        return len(self.post_list)

    def __getitem__(self, idx):
        """Tokenize one record (or emit one pre-concatenated buffer).

        Returns a dict of tensors whose exact keys depend on the model family
        (GLM needs position_ids; ChatGLM2 uses full_attention_mask; etc.).
        """
        data = self.post_list[idx]
        if not self.concat_samples:
            prompt = data['prompt']
            label = data.get('label', None)
            # GLM (but not ChatGLM): uses [MASK]-infilling; the prompt and the
            # generation target are budgeted separately to sum to max_length.
            if "glm" in self.model_name_or_path.lower() and "chatglm" not in self.model_name_or_path.lower():
                encoded_prompt = self.tokenizer(prompt, self.tokenizer.mask_token)
                prompt_length = len(encoded_prompt['input_ids'])
                # +1 reserves room for the end-of-generation token.
                label_length = len(self.tokenizer.tokenize(label)) + 1
                if prompt_length + label_length > self.args.max_length:
                    # Shrink the longer side one token at a time until it fits.
                    num_tokens_to_remove = prompt_length + label_length - self.args.max_length
                    for _ in range(num_tokens_to_remove):
                        if prompt_length > label_length:
                            prompt_length -= 1
                        else:
                            label_length -= 1
                else:
                    # Give all remaining budget to the generation side.
                    label_length = self.args.max_length - prompt_length
                assert prompt_length > 0
                assert label_length > 0
                assert prompt_length + label_length == self.args.max_length
                encoded_dict = self.tokenizer(prompt, self.tokenizer.mask_token,
                                              max_length=prompt_length, truncation="only_first",
                                              return_tensors="pt", return_attention_mask=True,
                                              return_token_type_ids=False)
                # GLM-specific helper appends the generation span and builds
                # the 2D position ids.
                encoded_dict = self.tokenizer.build_inputs_for_generation(encoded_dict, targets=label,
                                                                          max_gen_length=label_length, padding=True)
                return {
                    "input_ids": encoded_dict['input_ids'][0],
                    "position_ids": encoded_dict['position_ids'][0],
                    "attention_mask": encoded_dict['attention_mask'][0],
                    "labels": encoded_dict['labels'][0],
                }
            else:
                # Wrap the prompt in each model family's chat template.
                if "chatglm2" in self.model_name_or_path.lower():
                    prompt = f"[Round {1}]\n\n问:{prompt}\n\n答:"
                    label = label
                elif "chatglm" in self.model_name_or_path.lower():
                    prompt = f"[Round {0}]\n问:{prompt}\n答:"
                    label = label
                elif "vicuna" in self.model_name_or_path.lower():
                    # vicuna: fold the label into the prompt text itself.
                    prompt += "\n\n" + label
                    label = None
                else:
                    label = None
                encoded_dict = self.tokenizer(prompt, label,
                                              max_length=self.args.max_length, truncation="longest_first",
                                              padding="max_length", return_token_type_ids=False,
                                              return_tensors="pt",
                                              )
                if "pangu" in self.model_name_or_path.lower():
                    # NOTE(review): unlike the other branches the batch dim is
                    # kept here (no [0] indexing) — presumably handled
                    # downstream; confirm against the pangu training loop.
                    return {
                        "input_ids": encoded_dict['input_ids'],
                        "attention_mask": encoded_dict['attention_mask'],
                        "labels": encoded_dict['input_ids'],
                    }
                else:
                    # Causal LM: labels are the inputs themselves.
                    result = {
                        "input_ids": encoded_dict['input_ids'][0],
                        "labels": encoded_dict['input_ids'][0],
                    }
                    if 'attention_mask' in encoded_dict:
                        result["attention_mask"] = encoded_dict['attention_mask'][0]
                    return result
        else:
            # Concatenated mode: build a block-diagonal causal mask so tokens
            # of one packed sample cannot attend to a neighboring sample.
            eos_ids = data['eos_ids']
            input_ids = data['input_ids']
            # Initialize everything to the additive-mask "minus infinity".
            combined_attention_mask = torch.full((self.args.max_length, self.args.max_length),
                                                 torch.tensor(torch.finfo(torch.float16).min))
            for i in range(len(eos_ids) - 1):
                # Per-sample causal mask over the span [eos_ids[i], eos_ids[i+1]).
                attention_mask = torch.ones((1, eos_ids[i + 1] - eos_ids[i]), dtype=torch.long)
                attention_mask = _prepare_decoder_attention_mask(attention_mask, attention_mask.shape,
                                                                 input_embeds=torch.ones(1, dtype=torch.float16, device="cpu"),
                                                                 past_key_values_length=0)
                logger.debug(f"{i}-th sample, shape: {attention_mask.shape}, attention_mask: {attention_mask}")
                combined_attention_mask[eos_ids[i]:eos_ids[i + 1], eos_ids[i]:eos_ids[i + 1]] = attention_mask
            logger.debug(f"shape: {combined_attention_mask.shape}, combined_attention_mask: {combined_attention_mask}")
            if "chatglm2" in self.model_name_or_path.lower():
                # ChatGLM2 expects the key "full_attention_mask".
                return {
                    "input_ids": input_ids,
                    "labels": input_ids,
                    "full_attention_mask": combined_attention_mask,
                }
            else:
                return {
                    "input_ids": input_ids,
                    "labels": input_ids,
                    "attention_mask": combined_attention_mask,
                }

    def load_dataset(self, filename):
        """Read the JSONL file and return the sample list.

        In non-concat mode each returned item is ``{"prompt", "label"}``.
        In concat mode each item is ``{"input_ids", "eos_ids"}`` where
        ``eos_ids`` holds the cumulative sample boundaries inside the packed
        ``max_length`` token buffer. Records with an empty prompt are dropped.
        """
        discard = 0
        datasets = []
        with open(filename, "r", encoding="utf-8") as f:
            data = []
            eos_ids = [0]
            length = 0
            for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"):
                item = json.loads(line)
                prompt = str(item['prompt'])
                label = item.get('label', None)
                if len(prompt) <= 0:
                    discard += 1
                    continue
                if not self.concat_samples:
                    datasets.append({"prompt": prompt, "label": label})
                else:
                    # Packing is only implemented for ChatGLM2 and plain
                    # causal-LM tokenizers, not for GLM/ChatGLM(1).
                    if "chatglm2" not in self.model_name_or_path.lower():
                        assert "glm" not in self.model_name_or_path.lower(), \
                            "Concatenating samples for GLM or ChatGLM not implemented yet"
                    if "chatglm2" in self.model_name_or_path.lower():
                        prompt = f"[Round {1}]\n\n问:{prompt}\n\n答:"
                    else:
                        # Fold the label into the prompt text for plain LMs.
                        prompt = prompt if label is None else "\n\n".join((prompt, label))
                        label = None
                    token_ids = self.tokenizer.encode(prompt, label,
                                                      max_length=self.args.max_length - length,
                                                      truncation="longest_first")
                    if length + len(token_ids) < self.args.max_length:
                        # Still room in the current buffer: keep packing.
                        data.extend(token_ids)
                        length += len(token_ids)
                        eos_ids.append(length)
                    else:
                        # Buffer full: truncate the current record to fit,
                        # flush the buffer, and start a new one.
                        # NOTE(review): the truncated remainder of this record
                        # is discarded rather than carried over — confirm this
                        # is intentional.
                        data.extend(token_ids[:(self.args.max_length - length)])
                        eos_ids.append(self.args.max_length)
                        datasets.append({"input_ids": data, "eos_ids": eos_ids})
                        data = []
                        eos_ids = [0]
                        length = 0
        print_rank_0(
            f"Finished loading {os.path.basename(filename)}, # samples: {len(datasets)}, # discarded: {discard}")
        return datasets
            print_rank_0(f"SFTDataset sample-{k}\n: {self.post_list[k]}")

    def __len__(self):
        return len(self.post_list)

    def __getitem__(self, idx):
        """Tokenize one sample, dispatching on the model family encoded in the path."""
        data = self.post_list[idx]
        if not self.concat_samples:
            prompt = data['prompt']
            label = data['label']
            prefix = data['prefix']
            system = data['system']
            if "glm" in self.model_name_or_path.lower() and "chatglm" not in self.model_name_or_path.lower():
                # GLM: split args.max_length between prompt and generation target,
                # shrinking the longer side first when the pair does not fit.
                encoded_prompt = self.tokenizer(prompt, prefix + self.tokenizer.mask_token)
                prompt_length = len(encoded_prompt['input_ids'])
                label_length = len(self.tokenizer.tokenize(label)) + 1
                if prompt_length + label_length > self.args.max_length:
                    num_tokens_to_remove = prompt_length + label_length - self.args.max_length
                    for _ in range(num_tokens_to_remove):
                        if prompt_length > label_length:
                            prompt_length -= 1
                        else:
                            label_length -= 1
                else:
                    label_length = self.args.max_length - prompt_length
                assert prompt_length > 0
                assert label_length > 0
                assert prompt_length + label_length == self.args.max_length
                encoded_dict = self.tokenizer(prompt, prefix + self.tokenizer.mask_token,
                                              max_length=prompt_length, truncation="only_first",
                                              return_tensors="pt", return_attention_mask=True,
                                              return_token_type_ids=False)
                encoded_dict = self.tokenizer.build_inputs_for_generation(encoded_dict, targets=label,
                                                                          max_gen_length=label_length,
                                                                          padding=True)
                return {
                    "input_ids": encoded_dict['input_ids'][0],
                    "position_ids": encoded_dict['position_ids'][0],
                    "attention_mask": encoded_dict['attention_mask'][0],
                    "labels": encoded_dict['labels'][0],
                }
            elif "pangu" in self.model_name_or_path.lower():
                label = prefix + label
                encoded_dict = self.tokenizer(prompt, label, max_length=self.args.max_length,
                                              truncation="longest_first", padding="max_length",
                                              return_token_type_ids=False, return_tensors="pt",
                                              )
                # NOTE(review): unlike the other branches this returns the (1, L)
                # batched tensors without [0] — verify the collator expects that.
                return {
                    "input_ids": encoded_dict['input_ids'],
                    "attention_mask": encoded_dict['attention_mask'],
                    "labels": encoded_dict['input_ids'],
                }
            elif "chatglm3" in self.model_name_or_path.lower():
                # Project helper builds the ChatGLM3 chat template and label mask.
                input_ids, labels, _ = chatglm3_encode(self.tokenizer, prompt, label, system,
                                                       self.args.max_length)
                return {
                    "input_ids": torch.tensor(input_ids, dtype=torch.long),
                    "labels": torch.tensor(labels, dtype=torch.long),
                }
            elif "chatglm2" in self.model_name_or_path.lower():
                input_ids, labels, _ = chatglm2_encode(self.tokenizer, prompt, label, system,
                                                       self.args.max_length)
                return {
                    "input_ids": torch.tensor(input_ids, dtype=torch.long),
                    "labels": torch.tensor(labels, dtype=torch.long),
                }
            elif "chatglm" in self.model_name_or_path.lower():
                # ChatGLM (v1) dialogue template.
                prompt = f"[Round {0}]\n问:{prompt}\n答:"
                encoded_dict = self.tokenizer(prompt, label, max_length=self.args.max_length,
                                              truncation="longest_first", padding="max_length",
                                              return_token_type_ids=False, return_tensors="pt",
                                              )
                return {
                    "input_ids": encoded_dict['input_ids'][0],
                    "attention_mask": encoded_dict['attention_mask'][0],
                    "labels": encoded_dict['input_ids'][0],
                }
            else:
                # Generic causal LM fallback; attention_mask is optional for some tokenizers.
                encoded_dict = self.tokenizer(prompt, label, max_length=self.args.max_length,
                                              truncation="longest_first", padding="max_length",
                                              return_token_type_ids=False, return_tensors="pt",
                                              )
                result = {
                    "input_ids": encoded_dict['input_ids'][0],
                    "labels": encoded_dict['input_ids'][0],
                }
                if 'attention_mask' in encoded_dict:
                    result["attention_mask"] = encoded_dict['attention_mask'][0]
                return result
        else:
            # Packed-sample branch: build a block-diagonal causal mask so tokens of one
            # packed sample cannot attend to another.
            eos_ids = data['eos_ids']
            input_ids = data['input_ids']
            combined_attention_mask = torch.full((self.args.max_length, self.args.max_length),
                                                 torch.tensor(torch.finfo(torch.float16).min))
            for i in range(len(eos_ids) - 1):
                attention_mask = torch.ones((1, eos_ids[i + 1] - eos_ids[i]), dtype=torch.long)
                attention_mask = _prepare_decoder_attention_mask(attention_mask, attention_mask.shape,
                                                                 input_embeds=torch.ones(1, dtype=torch.float16,
                                                                                         device="cpu"),
                                                                 past_key_values_length=0)
                logger.debug(f"{i}-th sample, shape: {attention_mask.shape}, attention_mask: {attention_mask}")
                combined_attention_mask[eos_ids[i]:eos_ids[i + 1], eos_ids[i]:eos_ids[i + 1]] = attention_mask
            logger.debug(f"shape: {combined_attention_mask.shape}, combined_attention_mask: {combined_attention_mask}")
            if "chatglm2" in self.model_name_or_path.lower():
                return {
                    "input_ids": input_ids,
                    "labels": input_ids,
                    "full_attention_mask": combined_attention_mask,
                }
            else:
                return {
                    "input_ids": input_ids,
                    "labels": input_ids,
                    "attention_mask": combined_attention_mask,
                }

    def load_dataset(self, filename):
        """Load a jsonl SFT file: keeps only human_generated rows; takes answers[0]."""
        discard = 0
        datasets = []
        with open(filename, "r", encoding="utf-8") as f:
            data = []
            eos_ids = [0]
            length = 0
            for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"):
                item = json.loads(line)
                data_type = item.get('data_type', "human_generated")
                if data_type != "human_generated":
                    continue
                prompt = str(item['prompt'])
                label = str(item['answers'][0]['answer'])
                # (this method continues on the next source line)
                score = item['answers'][0]['score']
                prefix = item.get('prefix', "")
                system = item.get('system', "")
                if len(prompt) <= 0 or len(label) <= 0:
                    discard += 1
                    continue
                if not self.concat_samples:
                    datasets.append({"prompt": prompt, "label": label, "prefix": prefix, "system": system})
                else:
                    if "chatglm2" not in self.model_name_or_path.lower():
                        assert "glm" not in self.model_name_or_path.lower(), \
                            "Concatenating samples for GLM or ChatGLM not implemented yet"
                    else:
                        # NOTE(review): unlike the pretrain loader, this nesting means the
                        # prompt/label join in the inner else is only reachable for
                        # chatglm2 paths (where it is dead) — non-chatglm2 models pass
                        # prompt and label to encode() unmodified. Verify intent.
                        if "chatglm2" in self.model_name_or_path.lower():
                            prompt = f"[Round {1}]\n\n问:{prompt}\n\n答:"
                        else:
                            prompt = prompt if label is None else "\n\n".join((prompt, label))
                            label = None
                    token_ids = self.tokenizer.encode(prompt, label, max_length=self.args.max_length - length,
                                                      truncation="longest_first")
                    if length + len(token_ids) < self.args.max_length:
                        data.extend(token_ids)
                        length += len(token_ids)
                        eos_ids.append(length)
                    else:
                        # Buffer full: truncate current sample to fit, flush, reset.
                        data.extend(token_ids[:(self.args.max_length - length)])
                        eos_ids.append(self.args.max_length)
                        datasets.append({"input_ids": data, "eos_ids": eos_ids})
                        data = []
                        eos_ids = [0]
                        length = 0
        print_rank_0(
            f"Finished loading {os.path.basename(filename)}, # samples: {len(datasets)}, # discarded: {discard}")
        return datasets


class PairwiseDataset(Dataset):
    """Reward-model dataset yielding (chosen, rejected) encodings per pair."""

    def __init__(self, args, filename, tokenizer):
        self.pairs = self.load_dataset(filename)
        self.args = args
        self.tokenizer = tokenizer
        for k in range(5):
            print_rank_0(f"PairwiseDataset sample-{k}\n: {self.pairs[k]}")

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Encode one preference pair, dispatching on the model family."""
        pair = self.pairs[idx]
        prompt = pair["prompt"]
        chosen_answer = pair["chosen_answer"]
        rejected_answer = pair["rejected_answer"]
        prefix = pair['prefix']
        system = pair['system']
        if "pangu" in self.args.model_name_or_path.lower():
            chosen_encodings_dict = self.tokenizer(prompt, prefix + chosen_answer,
                                                   max_length=self.args.max_length,
                                                   truncation="longest_first", padding="max_length",
                                                   return_tensors="pt", return_token_type_ids=False)
            # (this call's arguments continue on the next source line)
            rejected_encodings_dict = self.tokenizer(prompt, prefix + rejected_answer,
                                                     max_length=self.args.max_length,
                                                     truncation="longest_first", padding="max_length",
                                                     return_tensors="pt", return_token_type_ids=False)
            return {
                "chosen_input_ids": chosen_encodings_dict["input_ids"],
                "chosen_attention_mask": chosen_encodings_dict["attention_mask"],
                "rejected_input_ids": rejected_encodings_dict["input_ids"],
                "rejected_attention_mask": rejected_encodings_dict["attention_mask"],
                "labels": rejected_encodings_dict["input_ids"],
            }
        elif "chatglm3" in self.args.model_name_or_path.lower():
            # NOTE(review): `labels` is overwritten by the rejected encoding, so only the
            # rejected labels are returned — confirm labels are unused in reward training.
            chosen_input_ids, labels, _ = chatglm3_encode(self.tokenizer, prompt, chosen_answer,
                                                          system, self.args.max_length)
            rejected_input_ids, labels, _ = chatglm3_encode(self.tokenizer, prompt, rejected_answer,
                                                            system, self.args.max_length)
            return {
                "chosen_input_ids": torch.tensor(chosen_input_ids, dtype=torch.long),
                "rejected_input_ids": torch.tensor(rejected_input_ids, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)
            }
        elif "chatglm2" in self.args.model_name_or_path.lower():
            chosen_input_ids, labels, _ = chatglm2_encode(self.tokenizer, prompt, chosen_answer,
                                                          system, self.args.max_length)
            rejected_input_ids, labels, _ = chatglm2_encode(self.tokenizer, prompt, rejected_answer,
                                                            system, self.args.max_length)
            return {
                "chosen_input_ids": torch.tensor(chosen_input_ids, dtype=torch.long),
                "rejected_input_ids": torch.tensor(rejected_input_ids, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)
            }
        elif "chatglm" in self.args.model_name_or_path.lower():
            # ChatGLM (v1) dialogue template.
            prompt = f"[Round {0}]\n问:{prompt}\n答:"
            chosen_encodings_dict = self.tokenizer(prompt, chosen_answer, max_length=self.args.max_length,
                                                   truncation="longest_first", padding="max_length",
                                                   return_tensors="pt")
            rejected_encodings_dict = self.tokenizer(prompt, rejected_answer, max_length=self.args.max_length,
                                                     truncation="longest_first", padding="max_length",
                                                     return_tensors="pt")
            return {
                "chosen_input_ids": chosen_encodings_dict["input_ids"][0],
                "rejected_input_ids": rejected_encodings_dict["input_ids"][0],
                "labels": rejected_encodings_dict["input_ids"][0],
            }
        elif "glm" in self.args.model_name_or_path.lower():
            # GLM: budget max_length between prompt and each answer independently,
            # trimming the longer side when a pair does not fit.
            chosen_prompt_length = len(self.tokenizer.tokenize(prompt + prefix)) + 4
            rejected_prompt_length = chosen_prompt_length
            chosen_answer_length = len(self.tokenizer.tokenize(chosen_answer)) + 1
            if chosen_prompt_length + chosen_answer_length > self.args.max_length:
                if chosen_prompt_length >= chosen_answer_length:
                    chosen_prompt_length -= chosen_prompt_length + chosen_answer_length - self.args.max_length
                else:
                    chosen_answer_length -= chosen_prompt_length + chosen_answer_length - self.args.max_length
            else:
                chosen_answer_length = self.args.max_length - chosen_prompt_length
            chosen_encoded_dict = self.tokenizer(prompt, prefix + self.tokenizer.mask_token,
                                                 max_length=chosen_prompt_length, truncation="only_first",
                                                 return_tensors="pt", return_token_type_ids=False)
            chosen_encodings_dict = self.tokenizer.build_inputs_for_generation(chosen_encoded_dict,
                                                                               targets=chosen_answer,
                                                                               max_gen_length=chosen_answer_length,
                                                                               padding=True)
            rejected_answer_length = len(self.tokenizer.tokenize(rejected_answer)) + 1
            if rejected_prompt_length + rejected_answer_length > self.args.max_length:
                if rejected_prompt_length >= rejected_answer_length:
                    rejected_prompt_length -= rejected_prompt_length + rejected_answer_length - self.args.max_length
                else:
                    rejected_answer_length -= rejected_prompt_length + rejected_answer_length - self.args.max_length
            else:
                rejected_answer_length = self.args.max_length - rejected_prompt_length
            rejected_encoded_dict = self.tokenizer(prompt, prefix + self.tokenizer.mask_token,
                                                   max_length=rejected_prompt_length, truncation="only_first",
                                                   return_tensors="pt", return_token_type_ids=False)
            rejected_encodings_dict = self.tokenizer.build_inputs_for_generation(rejected_encoded_dict,
                                                                                 targets=rejected_answer,
                                                                                 max_gen_length=rejected_answer_length,
                                                                                 padding=True)
            # (this return dict continues on the next source line)
            return {
                "chosen_input_ids": chosen_encodings_dict["input_ids"][0],
                "chosen_attention_mask": chosen_encodings_dict["attention_mask"][0],
                "chosen_position_ids": chosen_encodings_dict["position_ids"][0],
                "rejected_input_ids": rejected_encodings_dict["input_ids"][0],
                "rejected_attention_mask": rejected_encodings_dict["attention_mask"][0],
                "rejected_position_ids": rejected_encodings_dict["position_ids"][0],
                "labels": rejected_encodings_dict["input_ids"][0],
            }
"chosen_attention_mask": chosen_encodings_dict["attention_mask"][0], "chosen_position_ids": chosen_encodings_dict["position_ids"][0], "rejected_input_ids": rejected_encodings_dict["input_ids"][0], "rejected_attention_mask": rejected_encodings_dict["attention_mask"][0], "rejected_position_ids": rejected_encodings_dict["position_ids"][0], "labels": rejected_encodings_dict["input_ids"][0], } else: raise ValueError(f"Unsupported model name: {self.args.model_name_or_path}") @staticmethod def load_dataset(filename): discard = 0 pairs = [] with open(filename, "r", encoding="utf-8") as f: for line in tqdm(f, desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) prompt = str(item['prompt']) answers = item['answers'] prefix = item.get('prefix', "") system = item.get('system', "") chosen_answer, rejected_answer = None, None for i in range(len(answers) - 1): answer_1 = str(answers[i]["answer"]) answer_1_score = answers[i]["score"] answer_2 = str(answers[i + 1]["answer"]) answer_2_score = answers[i + 1]["score"] if answer_1_score > answer_2_score: chosen_answer = answer_1 rejected_answer = answer_2 if chosen_answer is not None and rejected_answer is not None \ and len(prompt) > 0 and len(chosen_answer) > 0 and len(rejected_answer) > 0 \ and chosen_answer != rejected_answer: pair = { "prompt": prompt, "prefix": prefix, "system": system, "chosen_answer": chosen_answer, "rejected_answer": rejected_answer } pairs.append(pair) else: discard += 1 print_rank_0(f"Finished loading {os.path.basename(filename)}, # pairs: {len(pairs)}, # discarded: {discard}") return pairs class RLHFDataset(Dataset): def __init__(self, args, filename, tokenizer): self.args = args self.tokenizer = tokenizer assert tokenizer.padding_side == "left", "In RLHF training, need to set padding_side to 'left'" self.post_list = self.load_dataset(filename) for k in range(5): print_rank_0(f"RLHFDataset sample-{k}\n: {self.post_list[k]}") def __len__(self): return len(self.post_list) def 
    def __getitem__(self, idx):
        """Encode one prompt for generation, dispatching on the actor model family."""
        data = self.post_list[idx]
        prompt = data['prompt']
        prefix = data['prefix']
        system = data['system']
        # NOTE(review): these branches test the raw path (no .lower()) unlike the
        # sibling dataset classes, and the error below names model_name_or_path
        # while the checks use actor_model_path — verify both.
        if "pangu" in self.args.actor_model_path:
            encoded_dict = self.tokenizer(prompt, self.tokenizer.sep_token + prefix,
                                          max_length=self.args.max_prompt_length,
                                          truncation="only_first",
                                          add_special_tokens=False,
                                          return_tensors="pt",
                                          return_token_type_ids=False)
            return {
                "input_ids": encoded_dict['input_ids'][0],
                "attention_mask": encoded_dict['attention_mask'][0],
            }
        elif "chatglm" in self.args.actor_model_path:
            # Prepend the system text, then apply the version-specific chat template.
            prompt = "\n\n".join((system, prompt))
            prompt = f"[Round {1}]\n\n问:{prompt}\n\n答:" if "chatglm2" in self.args.actor_model_path else f"[Round {0}]\n问:{prompt}\n答:"
            encoded_dict = self.tokenizer(prompt, max_length=self.args.max_prompt_length,
                                          return_tensors="pt", truncation="only_first")
            return {
                "input_ids": encoded_dict['input_ids'][0],
            }
        elif "glm" in self.args.actor_model_path:
            encoded_dict = self.tokenizer(prompt, prefix + self.tokenizer.mask_token,
                                          max_length=self.args.max_prompt_length,
                                          truncation="only_first",
                                          return_tensors="pt",
                                          return_token_type_ids=False)
            encoded_dict = self.tokenizer.build_inputs_for_generation(encoded_dict,
                                                                      max_gen_length=self.args.max_gen_length,
                                                                      padding=True)
            return {
                "input_ids": encoded_dict['input_ids'][0],
                "position_ids": encoded_dict['position_ids'][0],
                "generation_attention_mask": encoded_dict['generation_attention_mask'][0],
            }
        else:
            raise ValueError(f"Unsupported model name: {self.args.model_name_or_path}")

    @staticmethod
    def load_dataset(filename):
        """Load jsonl prompts, keeping only human_generated rows with non-empty prompts."""
        discard = 0
        datasets = []
        with open(filename, "r", encoding="utf-8") as f:
            for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"):
                item = json.loads(line)
                data_type = item.get('data_type', "human_generated")
                if data_type != "human_generated":
                    continue
                prompt = str(item['prompt'])
                prefix = item.get('prefix', "")
                system = item.get('system', "")
                if len(prompt) <= 0:
                    discard += 1
                    continue
                datasets.append({"prompt": prompt, "system": system, "prefix": prefix})
        print_rank_0(
            f"Finished loading {os.path.basename(filename)}, # samples: {len(datasets)}, # discarded: {discard}")
        return datasets


class PPODataset:
    """Fixed-capacity buffer of experience batches, re-split into mini-batches."""

    def __init__(self, max_size, small_batch_size):
        self.dataset = []                         # buffered large batches
        self.max_size = max_size                  # number of large batches to hold
        self.small_batch_size = small_batch_size  # mini-batch size for training

    def separate(self):
        """Slice every buffered batch (list/tuple/dict/tensor) into mini-batches, then empty the buffer."""
        small_dataset = []
        for large_batch in self.dataset:
            # Determine the batch dimension for each supported container shape.
            if type(large_batch) == list or type(large_batch) == tuple:
                large_size = len(large_batch[0])
            elif type(large_batch) == dict:
                large_size = len(large_batch[list(large_batch.keys())[0]])
            else:
                large_size = len(large_batch)
            for i in range(0, large_size, self.small_batch_size):
                if type(large_batch) == list or type(large_batch) == tuple:
                    small_dataset.append(
                        [x[i:i + self.small_batch_size] for x in large_batch])
                elif type(large_batch) == dict:
                    small_dataset.append({
                        k: v[i:i + self.small_batch_size] if v is not None else None
                        for k, v in large_batch.items()
                    })
                else:
                    small_dataset.append(large_batch[i:i + self.small_batch_size])
        self.free()
        return small_dataset

    def add(self, data):
        """Buffer one batch; when full, return the mini-batch split, else None."""
        if len(self.dataset) < self.max_size:
            self.dataset.append(data)
            if len(self.dataset) == self.max_size:
                return self.separate()
            else:
                return None
        else:
            raise ValueError(
                "The dataset is full but we did not stop it. There is a bug in the code."
            )
    def free(self):
        """Drop all buffered batches."""
        self.dataset = []


class DPODataset(Dataset):
    """Preference pairs for DPO training; each pair carries a stable integer index."""

    def __init__(self, args, filename, tokenizer):
        self.pairs = self.load_dataset(filename)
        self.args = args
        self.tokenizer = tokenizer
        for k in range(5):
            print_rank_0(f"DPODataset sample-{k}\n: {self.pairs[k]}")

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Encode one DPO pair, dispatching on the model family."""
        pair = self.pairs[idx]
        index = pair["index"]
        prompt = pair["prompt"]
        chosen_answer = pair["chosen_answer"]
        rejected_answer = pair["rejected_answer"]
        prefix = pair['prefix']
        system = pair['system']
        if "pangu" in self.args.model_name_or_path.lower():
            # NOTE(review): this branch (and the chatglm-v1 branch) does not return
            # "index" while the chatglm2/3 branches do — confirm downstream tolerance.
            chosen_encodings_dict = self.tokenizer(prompt, prefix + chosen_answer,
                                                   max_length=self.args.max_length,
                                                   truncation="longest_first", padding="max_length",
                                                   return_tensors="pt", return_token_type_ids=False)
            rejected_encodings_dict = self.tokenizer(prompt, prefix + rejected_answer,
                                                     max_length=self.args.max_length,
                                                     truncation="longest_first", padding="max_length",
                                                     return_tensors="pt", return_token_type_ids=False)
            return {
                "chosen_input_ids": chosen_encodings_dict["input_ids"],
                "chosen_attention_mask": chosen_encodings_dict["attention_mask"],
                "rejected_input_ids": rejected_encodings_dict["input_ids"],
                "rejected_attention_mask": rejected_encodings_dict["attention_mask"],
                "labels": rejected_encodings_dict["input_ids"],
            }
        elif "chatglm3" in self.args.model_name_or_path.lower():
            chosen_input_ids, chosen_labels, _ = chatglm3_encode(self.tokenizer, prompt, chosen_answer,
                                                                 system, self.args.max_length)
            rejected_input_ids, rejected_labels, _ = chatglm3_encode(self.tokenizer, prompt, rejected_answer,
                                                                     system, self.args.max_length)
            return {
                "index": torch.tensor(index, dtype=torch.long),
                "chosen_input_ids": torch.tensor(chosen_input_ids, dtype=torch.long),
                "rejected_input_ids": torch.tensor(rejected_input_ids, dtype=torch.long),
                "chosen_labels": torch.tensor(chosen_labels, dtype=torch.long),
                "rejected_labels": torch.tensor(rejected_labels, dtype=torch.long)
            }
        elif "chatglm2" in self.args.model_name_or_path.lower():
            chosen_input_ids, chosen_labels, _ = chatglm2_encode(self.tokenizer, prompt, chosen_answer,
                                                                 system, self.args.max_length)
            rejected_input_ids, rejected_labels, _ = chatglm2_encode(self.tokenizer, prompt, rejected_answer,
                                                                     system, self.args.max_length)
            return {
                "index": torch.tensor(index, dtype=torch.long),
                "chosen_input_ids": torch.tensor(chosen_input_ids, dtype=torch.long),
                "rejected_input_ids": torch.tensor(rejected_input_ids, dtype=torch.long),
                "chosen_labels": torch.tensor(chosen_labels, dtype=torch.long),
                "rejected_labels": torch.tensor(rejected_labels, dtype=torch.long)
            }
        elif "chatglm" in self.args.model_name_or_path.lower():
            # ChatGLM (v1) dialogue template.
            prompt = f"[Round {0}]\n问:{prompt}\n答:"
            chosen_encodings_dict = self.tokenizer(prompt, chosen_answer, max_length=self.args.max_length,
                                                   truncation="longest_first", padding="max_length",
                                                   return_tensors="pt")
            rejected_encodings_dict = self.tokenizer(prompt, rejected_answer, max_length=self.args.max_length,
                                                     truncation="longest_first", padding="max_length",
                                                     return_tensors="pt")
            return {
                "chosen_input_ids": chosen_encodings_dict["input_ids"][0],
                "rejected_input_ids": rejected_encodings_dict["input_ids"][0],
                "labels": rejected_encodings_dict["input_ids"][0],
            }
        else:
            raise ValueError(f"Unsupported model name: {self.args.model_name_or_path}")

    @staticmethod
    def load_dataset(filename):
        """Build indexed preference pairs; same adjacent-score scan as PairwiseDataset."""
        discard = 0
        index = 1
        pairs = []
        with open(filename, "r", encoding="utf-8") as f:
            for line in tqdm(f, desc=f"Loading {os.path.basename(filename)}"):
                item = json.loads(line)
                prompt = str(item['prompt'])
                answers = item['answers']
                prefix = item.get('prefix', "")
                system = item.get('system', "")
                chosen_answer, rejected_answer = None, None
                for i in range(len(answers) - 1):
                    answer_1 = str(answers[i]["answer"])
                    answer_1_score = answers[i]["score"]
                    answer_2 = str(answers[i + 1]["answer"])
                    answer_2_score = answers[i + 1]["score"]
                    if answer_1_score > answer_2_score:
                        chosen_answer = answer_1
                        rejected_answer = answer_2
                # (the filtering `if` continues on the next source line)
None and rejected_answer is not None \ and len(prompt) > 0 and len(chosen_answer) > 0 and len(rejected_answer) > 0 \ and chosen_answer != rejected_answer: pair = { "index": index, "prompt": prompt, "prefix": prefix, "system": system, "chosen_answer": chosen_answer, "rejected_answer": rejected_answer } index += 1 pairs.append(pair) else: discard += 1 print_rank_0(f"Finished loading {os.path.basename(filename)}, # pairs: {len(pairs)}, # discarded: {discard}") return pairs class OCNLIDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.label_dict = {'entailment': 'Yes', 'neutral': 'Maybe', 'contradiction': 'No'} dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"OCNLIDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label } def load_dataset(self, filename): discard = 0 datasets = [] 
with open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) s1 = item['sentence1'] s2 = item['sentence2'] label = item['label'] # 标注结果有冲突,则忽略 if label == "-": continue for l in self.label_dict.values(): prompt = f'{s1}?{l},{s2}' if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": self.label_dict[label]}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class CMNLIDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.label_dict = {'entailment': 'Yes', 'neutral': 'Maybe', 'contradiction': 'No'} dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"CMNLIDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, return_token_type_ids=False, 
return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label } def load_dataset(self, filename): discard = 0 datasets = [] with open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) s1 = item['sentence1'] s2 = item['sentence2'] label = item['label'] # 标注结果有冲突,则忽略 if label == "-": continue for l in self.label_dict.values(): prompt = f'{s1}?{l},{s2}' if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": self.label_dict[label]}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class CHIDDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.idiom_dict = self.load_idiom_dict() dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"CHIDDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] candidates = data['candidates'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, 
padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, return_token_type_ids=False, return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label, "candidates": candidates } def load_dataset(self, filename): discard = 0 datasets = [] with open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) candidates = item['candidates'] contents = item['content'] for content in contents: for idiom in re.findall(r"#idiom\d+#", content): label = candidates[self.idiom_dict[idiom]] for candidate in candidates: prompt = content.replace(idiom, candidate) if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": label, "candidates": candidates}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets def load_idiom_dict(self): idiom_dict = json.load(open(os.path.join(self.args.data_dir, "dev_answer.json"), "r", encoding="utf-8")) idiom_dict.update(json.load(open(os.path.join(self.args.data_dir, "train_answer.json"), "r", encoding="utf-8"))) print_rank_0(f"Finished loading idiom dict") return idiom_dict class CMRCDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"CMRCDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] # Few-Shot example construction if hasattr(self, "labelled_list"): 
examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label } def load_dataset(self, filename): discard = 0 datasets = [] data = json.load(open(filename, "r", encoding="utf-8")) for paragraphs in data['data']: for paragraph in paragraphs['paragraphs']: context = paragraph['context'] for qs in paragraph['qas']: question = qs['question'] answers = [] [answers.append(answer) for answer in qs['answers'] if answer not in answers] prompt_template = "阅读文章:{context}\n问:{question}\n答:" prompt = prompt_template.format(context=context, question=question) if len(prompt) <= 0: continue # if len(prompt) > self.args.max_length: # idx = len(prompt) - self.args.max_length # prompt = prompt_template.format(context=context[:-idx], question=question) datasets.append({"prompt": prompt, "label": answers}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class CLUEWSCDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.label_dict = {'true': '1', 'false': '0'} dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): 
print_rank_0(f"CLUEWSCDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, return_token_type_ids=False, return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label, } def load_dataset(self, filename): discard = 0 datasets = [] with open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) text = item['text'] span2_index = item['target']['span2_index'] span2_text = item['target']['span2_text'] span1_text = item['target']['span1_text'] label = self.label_dict[item['label']] prompt = text[:span2_index] + span1_text + text[span2_index + len(span2_text):] if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": label}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class C3Dataset(Dataset): def __init__(self, args, eval_filename, tokenizer, 
train_filename=None): self.tokenizer = tokenizer self.args = args dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"C3Dataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] candidates = data['candidates'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, return_token_type_ids=False, return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label, "candidates": candidates } def load_dataset(self, filename): discard = 0 datasets = [] data = json.load(open(filename, "r", encoding="utf-8")) for i, d in enumerate(data): context = "".join(d[0]) for qs in d[1]: question = qs['question'] choices = qs['choice'] choices_padded = [choices[i] if i < len(choices) else f"test{i}" for i in range(4)] answer = qs['answer'] for choice in choices: prompt = f"问: {question}\n答:{choice}\n该答案来自对话: {context}" if len(prompt) <= 0: continue 
datasets.append({"prompt": prompt, "label": answer, "candidates": choices_padded}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class AFQMCDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.label_dict = {'0': '不同', '1': '相同'} dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"AFQMCDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, return_token_type_ids=False, return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label } def load_dataset(self, filename): discard = 0 datasets = [] with open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) s1 
= item['sentence1'] s2 = item['sentence2'] label = self.label_dict[item['label']] for l in self.label_dict.values(): prompt = f'下面两个句子语义{l}:{s1}。{s2}' if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": label}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class CSLDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.label_dict = {'0': '不是', '1': '是'} dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"CSLDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, return_token_type_ids=False, return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label } def load_dataset(self, filename): discard = 0 datasets = [] with 
open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) abstract = item['abst'] keyword = "+".join(item['keyword']) label = self.label_dict[item['label']] for l in self.label_dict.values(): prompt = f'摘要:{abstract},关键词:{keyword}{l}真实关键词' if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": label}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class IFLYTEKDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.label_dict = {'0': '打车', '1': '地图导航', '2': '免费WIFI', '3': '租车', '4': '同城服务', '5': '快递物流', '6': '婚庆', '7': '家政', '8': '公共交通', '9': '政务', '10': '社区服务', '11': '薅羊毛', '12': '魔幻', '13': '仙侠', '14': '卡牌', '15': '飞行空战', '16': '射击游戏', '17': '休闲益智', '18': '动作类', '19': '体育竞技', '20': '棋牌中心', '21': '经营养成', '22': '策略', '23': 'MOBA', '24': '辅助工具', '25': '约会社交', '26': '即时通讯', '27': '工作社交', '28': '论坛圈子', '29': '婚恋社交', '30': '情侣社交', '31': '社交工具', '32': '生活社交', '33': '微博博客', '34': '新闻', '35': '漫画', '36': '小说', '37': '技术', '38': '教辅', '39': '问答交流', '40': '搞笑', '41': '杂志', '42': '百科', '43': '影视娱乐', '44': '求职', '45': '兼职', '46': '视频', '47': '短视频', '48': '音乐', '49': '直播', '50': '电台', '51': 'K歌', '52': '成人', '53': '中小学', '54': '职考', '55': '公务员', '56': '英语', '57': '视频教育', '58': '高等教育', '59': '成人教育', '60': '艺术', '61': '语言(非英语)', '62': '旅游资讯', '63': '综合预定', '64': '民航', '65': '铁路', '66': '酒店', '67': '行程管理', '68': '民宿短租', '69': '出国', '70': '工具', '71': '亲子儿童', '72': '母婴', '73': '驾校', '74': '违章', '75': '汽车咨询', '76': '汽车交易', '77': '日常养车', '78': '行车辅助', '79': '租房', '80': '买房', '81': '装修家居', '82': '电子产品', '83': '问诊挂号', '84': '养生保健', '85': '医疗服务', '86': '减肥瘦身', '87': '美妆美业', '88': '菜谱', '89': '餐饮店', '90': '体育咨讯', '91': '运动健身', '92': '支付', '93': '保险', '94': '股票', '95': '借贷', '96': '理财', '97': '彩票', '98': '记账', '99': '银行', '100': '美颜', '101': 
'影像剪辑', '102': '摄影修图', '103': '相机', '104': '绘画', '105': '二手', '106': '电商', '107': '团购', '108': '外卖', '109': '电影票务', '110': '社区超市', '111': '购物咨询', '112': '笔记', '113': '办公', '114': '日程管理', '115': '女性', '116': '经营', '117': '收款', '118': '其他'} dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"IFLYTEKDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] candidates = data['candidates'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, return_token_type_ids=False, return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label, "candidates": candidates } def load_dataset(self, filename): discard = 0 datasets = [] with open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) content = item['sentence'] label = item['label_des'] # randomly sample 3 
categories as negative sample labels = set(self.label_dict.values()) labels.remove(label) candidates = [label] + random.sample(labels, 3) for l in candidates: prompt = f'这是关于{l}的应用程序:{content}' if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": label, "candidates": candidates}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class TNEWSDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.label_dict = {'100': '故事', '101': '文化', '102': '娱乐', '103': '体育', '104': '财经', '106': '房产', '107': '汽车', '108': '教育', '109': '科技', '110': '军事', '112': '旅游', '113': '世界', '114': '股票', '115': '农业', '116': '游戏'} dataset = self.load_dataset(eval_filename) if train_filename is not None: self.labelled_list = self.load_dataset(eval_filename) self.post_list = dataset for k in range(5): print_rank_0(f"TNEWSDataset sample-{k}\n: {dataset[k]}") def __len__(self): return len(self.post_list) def __getitem__(self, idx): data = self.post_list[idx] prompt = data['prompt'] label = data['label'] candidates = data['candidates'] # Few-Shot example construction if hasattr(self, "labelled_list"): examples = random.sample(self.labelled_list, min(len(self.labelled_list), self.args.max_few_shot)) prompts = [] prompt_tokens = self.tokenizer.tokenize(prompt) for example in examples: example_prompt = example['prompt'] exmample_tokens = self.tokenizer.tokenize(example_prompt + "\n") if len(exmample_tokens) + len(prompt_tokens) + 2 > self.args.max_length: break else: prompts.append(example_prompt) prompt_tokens.extend(exmample_tokens) prompts.append(prompt) prompt = "\n".join(prompts) encoded_dict = self.tokenizer(prompt, max_length=self.args.max_length, padding="max_length", truncation="longest_first", return_tensors="pt") # label_dict = self.tokenizer(label, max_length=self.args.max_length, add_special_tokens=False, # return_attention_mask=False, 
return_token_type_ids=False, return_tensors="pt") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict["attention_mask"], "labels": encoded_dict["input_ids"], "label_str": label, "candidates": candidates } def load_dataset(self, filename): discard = 0 datasets = [] with open(filename, "r", encoding="utf-8") as f: for i, line in tqdm(enumerate(f), desc=f"Loading {os.path.basename(filename)}"): item = json.loads(line) content = item['sentence'] label = self.label_dict[item['label']] # randomly sample 3 categories as negative sample labels = set(self.label_dict.values()) labels.remove(label) candidates = [label] + random.sample(labels, 3) for l in candidates: prompt = f'这是关于{l}的文章:{content}' if len(prompt) <= 0: continue datasets.append({"prompt": prompt, "label": label, "candidates": candidates}) print_rank_0(f"Finished loading {os.path.basename(filename)}, # discarded: {discard}") return datasets class CEvalDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.model_name_or_path = args.model_name_or_path if hasattr(args, "model_name_or_path") else args.actor_model_path self.subject_mapping = json.load(open(os.path.join(RESOURCE_PATH, "eval", "ceval", "subject_mapping.json"))) self.max_length = args.max_length - args.max_length_generation self.choices = ["A", "B", "C", "D"] self.post_list = self.load_dataset(eval_filename) if train_filename is not None: self.dev_list = self.load_dataset(train_filename, "dict") for k in range(5): print_rank_0(f"CEvalDataset sample-{k}\n: {self.post_list[k]}") def __len__(self): return len(self.post_list) def format_example(self, line, include_answer=True, cot=False): example = line['question'] for choice in self.choices: example += f'\n{choice}. 
{line[f"{choice}"]}' example += '\n答案:' if "chatglm" in self.model_name_or_path.lower() or "qwen" in self.model_name_or_path.lower(): if include_answer: if cot: ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。" else: ans = line["answer"] m = (example, ans) return m return example else: # example = line['question'] # for choice in self.choices: # example += f'\n{choice}. {line[f"{choice}"]}' if include_answer: if cot: example += "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。" else: example += line["answer"] else: if cot: example += "让我们一步一步思考,\n1." return example def __getitem__(self, idx): data = self.post_list[idx] subject_name = data['subject_name'] question = self.format_example(data, include_answer=False, cot=self.args.cot) prefix = f"以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。" history = [] if "chatglm" in self.model_name_or_path.lower(): sep = "\n\n" if "chatglm2" in self.model_name_or_path.lower() else "\n" offset = 1 if "chatglm2" in self.model_name_or_path.lower() else 0 # Few-Shot example construction if hasattr(self, "dev_list"): history.append(prefix) k = self.args.max_few_shot dev_list = self.dev_list[subject_name] for i in range(min(k, len(dev_list))): prompt, answer = self.format_example(dev_list[i], include_answer=True, cot=self.args.cot) prompt = f"[Round {i + offset}]{sep}问:{prompt}{sep}答:{answer}" history.append(prompt) # Concat few-shot/zero-shot examples with question. 
# If length of full prompt exceeds max_length, remove examples until the length is smaller than max_length question = f"[Round {len(history) + offset}]{sep}问:{question}{sep}答:" while True: full_prompt = sep.join(history + [question]) input_ids = self.tokenizer.encode(full_prompt) if len(input_ids) <= self.max_length: break elif len(history) <= 1: full_prompt = question break else: history.pop(-1) encoded_dict = self.tokenizer(full_prompt, max_length=self.max_length, return_tensors="pt", truncation="longest_first") elif "qwen" in self.model_name_or_path.lower(): # Few-Shot example construction if hasattr(self, "dev_list"): k = self.args.max_few_shot dev_list = self.dev_list[subject_name] for i in range(min(k, len(dev_list))): history.append(self.format_example(dev_list[i], include_answer=True, cot=self.args.cot)) full_prompt, input_ids = qwen_make_context(self.tokenizer, question, history, system=prefix, max_window_size=self.max_length) encoded_dict = {"input_ids": torch.tensor(input_ids, dtype=torch.int64)} else: # Few-Shot example construction if hasattr(self, "dev_list"): history.append(prefix) k = self.args.max_few_shot dev_list = self.dev_list[subject_name] for i in range(min(k, len(dev_list))): history.append(self.format_example(dev_list[i], include_answer=True, cot=self.args.cot)) # Concat few-shot/zero-shot examples with question. 
# If length of full prompt exceeds max_length, remove examples until the length is smaller than max_length while True: full_prompt = "\n\n".join(history + [question]) input_ids = self.tokenizer.encode(full_prompt) if len(input_ids) <= self.max_length: break elif len(history) <= 1: full_prompt = question break else: history.pop(-1) encoded_dict = self.tokenizer(full_prompt, max_length=self.max_length, return_tensors="pt", truncation="longest_first") logger.debug(f"number of shots: {len(history) - 1}, full prompt: {full_prompt}") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict.get("attention_mask", None), "number_of_shots": max(len(history) - 1, 0), "id": data['id'], "subject_name_key": data['subject_name_key'], "answer": data.get('answer', None) } def load_dataset(self, filename, return_format="list"): datasets = list() if return_format == "list" else dict() dt = os.path.basename(filename) for subject_name_key, subject in self.subject_mapping.items(): subject_name = subject[1] if isinstance(datasets, dict): datasets[subject_name] = list() dev_file_path = os.path.join(filename, f'{subject_name_key}_{dt}.csv') dev_df = pd.read_csv(dev_file_path) for i, val in dev_df.iterrows(): d = val.to_dict() if isinstance(datasets, dict): datasets[subject_name].append(d) else: d['subject_name'] = subject_name d['subject_name_key'] = subject_name_key datasets.append(d) print_rank_0(f"Finished loading {dt} dataset") return datasets class MMLUDataset(Dataset): def __init__(self, args, eval_filename, tokenizer, train_filename=None): self.tokenizer = tokenizer self.args = args self.model_name_or_path = args.model_name_or_path if hasattr(args, "model_name_or_path") else args.actor_model_path self.subject_mapping = json.load(open(os.path.join(RESOURCE_PATH, "eval", "mmlu", "subject_mapping.json"))) self.choices = ["A", "B", "C", "D"] self.max_length = args.max_length - args.max_length_generation self.post_list = self.load_dataset(eval_filename) if 
train_filename is not None: self.dev_list = self.load_dataset(train_filename, "dict") for k in range(5): print_rank_0(f"MMLUDataset sample-{k}\n: {self.post_list[k]}") def __len__(self): return len(self.post_list) def format_example(self, line, include_answer=True): example = line['question'] for choice in self.choices: example += f'\n{choice}. {line[f"{choice}"]}' example += '\nAnswer:' if "chatglm" in self.model_name_or_path.lower() or "qwen" in self.model_name_or_path.lower(): if include_answer: ans = line["answer"] m = (example, ans) return m return example else: # example = line['question'] # for choice in self.choices: # example += f'\n{choice}. {line[f"{choice}"]}' if include_answer: example += line["answer"] return example def __getitem__(self, idx): data = self.post_list[idx] subject_name = data['subject_name'] question = self.format_example(data, include_answer=False) prefix = f"The following are multiple choice questions (with answers) about {subject_name}." history = [] if "chatglm" in self.model_name_or_path.lower(): sep = "\n\n" if "chatglm2" in self.model_name_or_path.lower() else "\n" offset = 1 if "chatglm2" in self.model_name_or_path.lower() else 0 # Few-Shot example construction if hasattr(self, "dev_list"): history.append(prefix) k = self.args.max_few_shot dev_list = self.dev_list[subject_name] for i in range(min(k, len(dev_list))): prompt, answer = self.format_example(dev_list[i], include_answer=True) prompt = f"[Round {i + offset}]{sep}问:{prompt}{sep}答:{answer}" history.append(prompt) # Concat few-shot/zero-shot examples with question. 
# If length of full prompt exceeds max_length, remove examples until the length is smaller than max_length question = f"[Round {len(history) + offset}]{sep}问:{question}{sep}答:" while True: full_prompt = sep.join(history + [question]) input_ids = self.tokenizer.encode(full_prompt) if len(input_ids) <= self.max_length: break elif len(history) <= 1: full_prompt = question break else: history.pop(-1) encoded_dict = self.tokenizer(full_prompt, max_length=self.max_length, return_tensors="pt", truncation="longest_first") elif "qwen" in self.model_name_or_path.lower(): # Few-Shot example construction if hasattr(self, "dev_list"): k = self.args.max_few_shot dev_list = self.dev_list[subject_name] for i in range(min(k, len(dev_list))): history.append(self.format_example(dev_list[i], include_answer=True)) full_prompt, input_ids = qwen_make_context(self.tokenizer, question, history, system=prefix, max_window_size=self.max_length) encoded_dict = {"input_ids": torch.tensor(input_ids, dtype=torch.int64)} else: # Few-Shot example construction if hasattr(self, "dev_list"): history.append(prefix) k = self.args.max_few_shot dev_list = self.dev_list[subject_name] for i in range(min(k, len(dev_list))): history.append(self.format_example(dev_list[i], include_answer=True)) # Concat few-shot/zero-shot examples with question. 
# If length of full prompt exceeds max_length, remove examples until the length is smaller than max_length while True: full_prompt = "\n\n".join(history + [question]) input_ids = self.tokenizer.encode(full_prompt) if len(input_ids) <= self.max_length: break elif len(history) <= 1: full_prompt = question break else: history.pop(-1) encoded_dict = self.tokenizer(full_prompt, max_length=self.max_length, return_tensors="pt", truncation="longest_first") return { "input_ids": encoded_dict["input_ids"], "attention_mask": encoded_dict.get("attention_mask", None), "number_of_shots": max(len(history) - 1, 0), "subject_name_key": data['subject_name_key'], "answer": data.get('answer', None) } def load_dataset(self, filename, return_format="list"): datasets = list() if return_format == "list" else dict() dt = os.path.basename(filename) for subject_name_key, subject in self.subject_mapping.items(): subject_name = subject[0] if isinstance(datasets, dict): datasets[subject_name] = list() dev_file_path = os.path.join(filename, f'{subject_name_key}_{dt}.csv') dev_df = pd.read_csv(dev_file_path, names=["question", "A", "B", "C", "D", "answer"]) for i, val in dev_df.iterrows(): d = val.to_dict() if isinstance(datasets, dict): datasets[subject_name].append(d) else: d['subject_name'] = subject_name d['subject_name_key'] = subject_name_key datasets.append(d) print_rank_0(f"Finished loading {dt} dataset") return datasets ================================================ FILE: src/data/data_types.py ================================================ from dataclasses import dataclass from typing import Iterable from torchtyping import TensorType @dataclass class PromptElement: """ Dataclass for a single prompt, containing its string and tokenized form. :param text: The prompt text. :type text: str :param tokens: The prompt tokens. 
Should be a long tensor :type tokens: torch.Tensor """ text: str tokens: TensorType["num_tokens"] @dataclass class PromptBatch: """ Batched PromptElement :param text: An iterable of prompt texts. :type text: Iterable[str] :param tokens: A long tensor batch of prompt tokens. :type tokens: torch.Tensor """ text: Iterable[str] tokens: TensorType["batch_size", "num_tokens"] @dataclass class AccelerateRLElement: """ Dataclass for RL elements, containing output tokens and rewards for each token. :param tokens: The output tokens. Should be a long tensor :type tokens: torch.Tensor :param rewards: The rewards for each token. Should be a float tensor of same size as tokens. :type rewards: torch.Tensor """ output_tokens: TensorType["output_size"] rewards: TensorType["output_size"] @dataclass class AccelerateRLBatchElement: """ Batched accelerate RL element :param tokens: Batches of long tensors of output tokens. :type tokens: torch.Tensor :param rewards: Batches of float tensors of rewards for each output token. :type rewards: torch.Tensor """ output_tokens: TensorType["batch_size", "output_size"] rewards: TensorType["batch_size", "output_size"] @dataclass class PPORLElement: """ :param query_tensor: The query tensor i.e. the prompt tokens. Should be a long tensor. :type query_tensor: torch.Tensor :param response_tensor: The response tensor i.e. the output tokens. Should be a long tensor. :type response_tensor: torch.Tensor :param logprobs: The log probabilities over the response tokens generated by the policy network (i.e. the autoregressive model). Should be a float tensor of same size as tokens. :type logprobs: torch.Tensor :param values: The values for each token generated from the value network or value head. Should be a float tensor of same size as tokens. :type values: torch.Tensor :param rewards: The rewards for each token outputted in response. Should be a float tensor of same size as tokens. 
:type rewards: torch.Tensor """ query_tensor: TensorType["query_size"] response_tensor: TensorType["response_size"] attention_mask: TensorType["query_size"] position_ids: TensorType["query_size"] logprobs: TensorType["response_size"] values: TensorType["response_size"] rewards: TensorType["response_size"] @dataclass class PPORLBatch: """ A batched version of the PPORLElement. See PPORLElement for more details on individual fields. :param query_tensors: A batch of query tensors. Should be a long tensor. :type query_tensors: torch.Tensor :param response_tensors: A batch of response tensors. Should be a long tensor. :type response_tensors: torch.Tensor :param logprobs: A batch of log probabilities from policy :type logprobs: torch.Tensor :param values: A batch of values from value network :type values: torch.Tensor :param rewards: A batch of rewards :type rewards: torch.Tensor """ query_tensors: TensorType["batch_size", "query_size"] response_tensors: TensorType["batch_size", "response_size"] attention_mask: TensorType["batch_size", "query_size"] position_ids: TensorType["batch_size", "query_size"] logprobs: TensorType["batch_size", "response_size"] values: TensorType["batch_size", "response_size"] rewards: TensorType["batch_size", "response_size"] ================================================ FILE: src/data/pipeline.py ================================================ import os import sys import time import json import torch from abc import abstractmethod from typing import Optional, Any, Callable, Dict, Iterable, List, Union from dataclasses import dataclass from torch.utils.data import DataLoader, Dataset from torch.nn.utils.rnn import pad_sequence from torchtyping import TensorType from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase from transformers.tokenization_utils_base import PaddingStrategy from src.data.data_types import PPORLBatch, PPORLElement from src.utils.config import TRLConfig # specifies a dictionary of architectures 
_DATAPIPELINE: Dict[str, any] = {} # registry @dataclass class GeneralElement: """ General element outputted by a data pipeline """ pass @dataclass class RLElement: """ Batch element for RL model """ state: Iterable[str] = None # Context/prompts action: TensorType["N"] = None # Tokens generated by model given prompts reward: float = None # Reward obtained for that generation @dataclass class BatchElement: """ General batch element for any transformer to use in its forward pass """ tokens: TensorType["BATCH", "SEQ_LEN"] masks: TensorType["BATCH", "SEQ_LEN"] @dataclass class GLMDataCollator: tokenizer: PreTrainedTokenizerBase padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None return_tensors: str = "pt" def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: # batch = self.tokenizer.pad( # features, # padding=self.padding, # max_length=self.max_length, # pad_to_multiple_of=self.pad_to_multiple_of, # return_tensors=self.return_tensors, # ) max_length = max(map(lambda x: x['input_ids'].shape[0], features)) input_ids_list = [] attention_mask_list = [] position_ids_list = [] labels_list = [] for feature in features: input_ids = feature['input_ids'] seq_length = input_ids.shape[0] # padding for GLM generation: cls_token_id + prompt_tokens + mask_token_id + [eos_token_id]*N + sop_token_id input_ids = torch.cat((input_ids[:-1], torch.tensor([self.tokenizer.pad_token_id]*(max_length-seq_length), dtype=input_ids.dtype), input_ids[-1:] ), dim=0) input_ids_list.append(input_ids) attention_mask_list.append(feature['generation_attention_mask']) position_ids_list.append(feature['position_ids']) if "labels" in feature: labels_list.append(feature['labels']) batch = { "input_ids": torch.stack(input_ids_list, dim=0), "generation_attention_mask": torch.stack(attention_mask_list, dim=0), "position_ids": torch.stack(position_ids_list, dim=0) } if len(labels_list) > 0: batch['labels'] = 
torch.stack(labels_list, dim=0) return batch def register_datapipeline(name): """Decorator used register a CARP architecture Args: name: Name of the architecture """ def register_class(cls, name): _DATAPIPELINE[name] = cls setattr(sys.modules[__name__], name, cls) return cls if isinstance(name, str): name = name.lower() return lambda c: register_class(c, name) cls = name name = cls.__name__ register_class(cls, name.lower()) return cls @register_datapipeline class BasePipeline(Dataset): def __init__(self, path: str = "dataset"): super().__init__() @abstractmethod def __getitem__(self, index: int) -> GeneralElement: pass @abstractmethod def __len__(self) -> int: pass @abstractmethod def create_loader( self, batch_size: int, shuffle: bool, prep_fn: Callable = None, num_workers: int = 0, ) -> DataLoader: """ Create a dataloader for the pipeline :param prep_fn: Typically a tokenizer. Applied to GeneralElement after collation. """ pass class BaseRolloutStore(Dataset): def __init__(self, capacity=-1): self.history: Iterable[Any] = None self.capacity = capacity @abstractmethod def push(self, exps: Iterable[Any]): """ Push experiences to rollout storage """ pass def __getitem__(self, index: int) -> RLElement: return self.history[index] def __len__(self) -> int: return len(self.history) @abstractmethod def create_loader( self, batch_size: int, shuffle: bool, prep_fn: Callable = None, num_workers: int = 0, ) -> DataLoader: """ Create a dataloader for the rollout store :param prep_fn: Applied to RLElement after collation (typically tokenizer) :type prep_fn: Callable """ pass @register_datapipeline class PanguPipeline(BasePipeline): def __init__(self, prompts: List[dict], config: TRLConfig, tokenizer: PreTrainedTokenizer): super().__init__() self.prompts = prompts self.tokenizer = tokenizer self.config = config self.max_prompt_length = config.train.seq_length - config.method.gen_kwargs["max_new_tokens"] def __len__(self): return len(self.prompts) def __getitem__(self, idx): 
data = self.prompts[idx] prompt = data['prompt'] prefix = data['prefix'] encoded_dict = self.tokenizer(prompt, self.tokenizer.sep_token + prefix, max_length=self.max_prompt_length, return_tensors="pt", truncation="only_first", # padding="max_length", add_special_tokens=False, return_token_type_ids=False) return { "input_ids": encoded_dict['input_ids'][0], "attention_mask": encoded_dict['attention_mask'][0], } def create_loader(self, batch_size: int, shuffle=False) -> DataLoader: return DataLoader(self, batch_size=batch_size, shuffle=shuffle) @register_datapipeline class GLMPipeline(BasePipeline): def __init__(self, prompts: List[dict], config: TRLConfig, tokenizer: PreTrainedTokenizer): super().__init__() self.prompts = prompts self.tokenizer = tokenizer # self.config = config self.max_generation_length = config.method.gen_kwargs["max_new_tokens"] self.max_prompt_length = config.train.seq_length - self.max_generation_length def __len__(self): return len(self.prompts) def __getitem__(self, idx): data = self.prompts[idx] prompt = data['prompt'] prefix = data['prefix'] inputs = self.tokenizer(prompt, prefix + self.tokenizer.mask_token, max_length=self.max_prompt_length, truncation="only_first", # padding="max_length", return_tensors="pt", return_token_type_ids=False) inputs_glm = self.tokenizer.build_inputs_for_generation(inputs, max_gen_length=self.max_generation_length, padding=True) return { "input_ids": inputs_glm['input_ids'][0], "position_ids": inputs_glm['position_ids'][0], "generation_attention_mask": inputs_glm['generation_attention_mask'][0] } def create_loader(self, batch_size: int, shuffle=False) -> DataLoader: # collate_fn = GLMDataCollator(self.tokenizer) return DataLoader(self, batch_size=batch_size, shuffle=shuffle)#, collate_fn=collate_fn) @register_datapipeline class ChatGLMPipeline(BasePipeline): def __init__(self, prompts: List[dict], config: TRLConfig, tokenizer: PreTrainedTokenizer): super().__init__() self.prompts = prompts self.tokenizer = 
tokenizer self.config = config self.max_prompt_length = config.train.seq_length - config.method.gen_kwargs["max_new_tokens"] def __len__(self): return len(self.prompts) def __getitem__(self, idx): data = self.prompts[idx] prompt = data['prompt'] encoded_dict = self.tokenizer(prompt, max_length=self.max_prompt_length, return_tensors="pt", truncation="only_first", padding="max_length") return { "input_ids": encoded_dict['input_ids'][0], # "attention_mask": encoded_dict['attention_mask'][0], } def create_loader(self, batch_size: int, shuffle=False) -> DataLoader: return DataLoader(self, batch_size=batch_size, shuffle=shuffle) class PPORolloutStorage(BaseRolloutStore): """ Rollout storage for training PPO """ def __init__(self, pad_token_id): super().__init__() self.pad_token_id = pad_token_id self.history: Iterable[PPORLElement] = [None] def push(self, exps: Iterable[PPORLElement]): self.history += exps def clear_history(self): self.history = [] def export_history(self, location: str): assert os.path.exists(location) fpath = os.path.join(location, f"epoch-{str(time.time())}.json") def exp_to_dict(exp): {k: v.cpu().tolist() for k, v in exp.__dict__.items()} data = [exp_to_dict(exp) for exp in self.history] with open(fpath, "w") as f: f.write(json.dumps(data, indent=2)) def __getitem__(self, index: int) -> PPORLElement: return self.history[index] def __len__(self) -> int: return len(self.history) def create_loader( self, batch_size: int, shuffle: bool, ) -> DataLoader: def collate_fn(elems: Iterable[PPORLElement]): return PPORLBatch( torch.stack([elem.query_tensor for elem in elems]), # # Left padding of already left-padded queries # pad_sequence( # [elem.query_tensor.flip(0) for elem in elems], # padding_value=self.pad_token_id, # batch_first=True, # ).flip(1), # Right pad the rest, to have a single horizontal query/response split pad_sequence( [elem.response_tensor for elem in elems], padding_value=self.pad_token_id, batch_first=True, ), 
                # attention masks are assumed pre-padded to equal length (stacked, not padded)
                torch.stack([elem.attention_mask for elem in elems]),
                # position_ids are kept as a plain Python list (ragged across elems)
                [elem.position_ids for elem in elems],
                pad_sequence(
                    [elem.logprobs for elem in elems],
                    padding_value=0.0,
                    batch_first=True,
                ),
                pad_sequence([elem.values for elem in elems], padding_value=0.0, batch_first=True),
                pad_sequence(
                    [elem.rewards for elem in elems],
                    padding_value=0.0,
                    batch_first=True,
                ),
            )

        return DataLoader(self, batch_size, shuffle=shuffle, collate_fn=collate_fn)


================================================
FILE: src/data_prepare.py
================================================
import sys
sys.path.insert(0, "/root/autodl-tmp/Code/RLHF")
sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt")
# sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt")
sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt")
import os
import time
import json
import argparse
import random
import glob
import pandas as pd
import opencc
from tqdm import tqdm
from transformers import AutoTokenizer

from src.utils.logger import logger
from src.utils.file_utils import set_seed
from src.utils.nlp_utils import clean_text


def weibo_summary_comment(args, tokenizer):
    '''
    Weibo news + abstract + comments. Each comment carries a like count, which is
    used as the score to build a reward-model training set.

    :param args: parsed CLI args (needs ``data_dir`` and ``output_dir``)
    :param tokenizer: unused here, kept for a uniform dataset-builder signature
    :return: processed json list
    '''
    t = time.time()
    fi = os.path.join(args.data_dir, "weibo_summary_comments_json.json")
    fo = os.path.join(args.output_dir, "weibo_summary_comments.jsonl")
    data = []
    with open(fo, "w", encoding="utf-8") as w:
        with open(fi, "r", encoding="utf-8") as r:
            while True:
                line = r.readline()
                if not line:
                    break
                item = json.loads(line.strip("\n"))
                article = item['article'].replace(" ", "")
                abstract = item['abstract'].replace(" ", "")
                prompt = f"新闻内容:{article} 摘要:{abstract}"
                prefix = "评论:"
                # sort comments by (like count, comment length), best first
                answers = [
                    {
                        "answer": k.replace(" ", ""),
                        "score": int(v)
                    } for (k, v) in sorted(item['comments'], key=lambda x: (int(x[1]), len(x[0])), reverse=True)
                ]
                # NOTE(review): this written record omits the "prefix" key while the
                # in-memory record (appended just below, next chunk) includes it — verify.
                w.write(json.dumps({"prompt": prompt, "answers": answers}, ensure_ascii=False)+'\n')
data.append({"prompt": prompt, "answers": answers, "prefix": prefix}) logger.info(f"length: {len(data)}, time taken: {time.time()-t} s") return data def couplets(args, tokenizer): ''' 对联数据(上联和下联),可以根据正确下联和负例下联,构造reward训练集 :param args: :param tokenizer: :return: processed json list ''' t1 = time.time() fi = os.path.join(args.data_dir, "couplets.txt") fo = os.path.join(args.output_dir, "couplets.jsonl") l2 = [] nexts = dict() with open(fi, "r", encoding="utf-8") as r: while True: line = r.readline() if not line: break line = line.strip("\n") idx = len(line) // 2 prompt = line[:idx] answer = line[idx+1:] prefix = "下联:" answers = [{"answer": answer, "score": 1}] l2.append({"prompt": f"上联:{prompt}", "answers": answers, "prefix": prefix}) length = len(answer) if length not in nexts: nexts[length] = list() nexts[length].append(answer) t2 = time.time() logger.info(f"length: {len(l2)}, # different lengths: {len(nexts)}, time taken: {t2-t1} s") data = [] with open(fo, "w", encoding="utf-8") as w: for l in tqdm(enumerate(l2), desc="Processing Couplets"): answer = l['answers'][0] length = len(answer['answer']) # 上下联长度一样 nexts_tmp = set(nexts[length]) nexts_tmp.remove(answer['answer']) nexts_tmp = set(nexts[length]).difference(set([answer['answer']])) # nexts_tmp.remove(answer['answer']) answers.extend([{"answer": fa, "score": 0} for fa in random.sample(nexts_tmp, 2)]) # 上下联长度不一样 keys = set(nexts.keys()) keys.remove(length) answers.extend([{"answer": random.choice(nexts[key]), "score": -1} for key in random.sample(keys, 2)]) # answers = sorted(answers, key=lambda x: x['score'], reverse=True) w.write(json.dumps({"prompt": l['prompt'], "answers": answers, "prefix": l['prefix']}, ensure_ascii=False)+'\n') data.append({"prompt": l['prompt'], "answers": answers, "prefix": l['prefix']}) # if i % 1000 == 0: # logger.info(f"{i} samples processed, time taken: {time.time()-t2} s") logger.info(f"length: {len(data)}, time taken: {time.time()-t2} s") return data def zhidao(args, tokenizer): 
    '''
    Baidu Zhidao QA data. Each question has several answers plus a best answer,
    so a reward-model training set can be built directly (score = is_best flag).

    :param args: parsed CLI args (needs ``data_dir`` and ``output_dir``)
    :param tokenizer: unused here, kept for a uniform dataset-builder signature
    :return: processed json list
    '''
    t = time.time()
    fp = os.path.join(args.data_dir, "zhidao", "*.csv")
    fo = os.path.join(args.output_dir, "zhidao.jsonl")
    data = []
    with open(fo, "w", encoding="utf-8") as w:
        for fi in glob.glob(fp):
            # sort so rows of the same title are adjacent and the best answer comes first
            df = pd.read_csv(fi).sort_values(by=["title", "is_best"], ascending=False)
            prev_title = None
            prev_prompt = None
            prefix = "答:"
            for _, val in df.iterrows():
                if isinstance(val['question'], str) and val['question'] != val['title']:
                    prompt = f"问题:{val['title']} 内容:{val['question']}"
                else:
                    prompt = f"问题:{val['title']}"
                if prev_title is not None and prev_title == val['title']:
                    # same question as the previous row: accumulate another answer
                    answers.append({"answer": val['reply'], "score": val['is_best']})
                else:
                    # new question: flush the previous group first
                    # NOTE(review): this mid-loop flush omits the "prefix" key that the
                    # final flush below includes — confirm whether that is intended.
                    if prev_title is not None:
                        # l3.append({"prompt": prev_prompt, "answers": copy.deepcopy(answers)})
                        w.write(json.dumps({"prompt": prev_prompt, "answers": answers}, ensure_ascii=False)+'\n')
                        data.append({"prompt": prev_prompt, "answers": answers})
                    answers = [{"answer": val['reply'], "score": val['is_best']}]
                    prev_prompt = prompt
                    prev_title = val['title']
            # flush the last group of the file
            # NOTE(review): if a CSV has zero rows, `answers`/`prev_prompt` are unbound here.
            # l3.append({"prompt": prev_prompt, "answers": copy.deepcopy(answers)})
            w.write(json.dumps({"prompt": prev_prompt, "answers": answers, "prefix": prefix}, ensure_ascii=False)+'\n')
            data.append({"prompt": prev_prompt, "answers": answers, "prefix": prefix})
            logger.info(f"finished processing {os.path.basename(fi)}")
    logger.info(f"length: {len(data)}, time taken: {time.time()-t} s")
    return data


def chinese_classical(args, tokenizer):
    '''
    Parallel Classical/Modern Chinese translations. Each classical passage has a
    modern translation; a reward-model set is built from the correct translation
    plus sampled negative translations.

    :param args: parsed CLI args (needs ``data_dir`` and ``output_dir``)
    :param tokenizer: unused here, kept for a uniform dataset-builder signature
    :return: processed json list
    '''
    t1 = time.time()
    fp = os.path.join(args.data_dir, "Classical-Modern", "bitext", "*")
    fo = os.path.join(args.output_dir, "chinese_classical.jsonl")
    l3 = []
    dicts = dict()
    for fi in glob.glob(fp):
        name = os.path.basename(fi)
        # per-file pools of classical (古文) and modern (现代文) sentences
        dicts[name] = {"古文": [], "现代文": []}
        with open(fi, "r", encoding="utf-8") as r:
            for i, line in enumerate(r):
line = line.strip("\n") if line.startswith("古文"): p1 = line[3:] dicts[name]['古文'].append(p1) elif line.startswith("现代文"): p2 = line[4:] dicts[name]['现代文'].append(p2) elif p1 is not None and p2 is not None: pair = [("古文", p1), ("现代文", p2)] random.shuffle(pair) prompt = f"{pair[0][0]}:{pair[0][1]}" prefix = f"{pair[1][0]}:" answers = [{"answer": pair[1][1], "score": 1}] l3.append({"prompt": prompt, "answers": answers, "prefix": prefix, "name": name}) p1 = None p2 = None t2 = time.time() logger.info(f"length: {len(l3)}, # different names: {len(dicts)}, time taken: {t2-t1} s") data = [] with open(fo, "w", encoding="utf-8") as w: for l in tqdm(enumerate(l3), desc="Processing Chinese Classical-Modern"): name = l['name'] prompt = l['prompt'] prefix = l['prefix'] answer = l['answers'][0]['answer'] if prompt.startswith("古文"): answer_type = '现代文' else: answer_type = '古文' samples_tmp = set(dicts[name][answer_type]) samples_tmp.remove(answer) answers.extend([{"answer": fa, "score": 0} for fa in random.sample(samples_tmp, 2)]) keys = set(dicts.keys()) keys.remove(name) answers.extend([{"answer": random.choice(dicts[key][answer_type]), "score": -1} for key in random.sample(keys, 2)]) w.write(json.dumps({"prompt": prefix, "answers": answers}, ensure_ascii=False)+'\n') data.append({"prompt": prefix, "answers": answers}) # if i % 100 == 0: # logger.info(f"{i} samples processed, time taken: {time.time()-t2} s") logger.info(f"length: {len(data)}, time taken: {time.time()-t2} s") return data def chinese_poetry(args, tokenizer): ''' 四书五经、诗、词、曲等古文数据,每篇文章或每首诗有作者、题目和正文,可以根据正确体裁/作者+负例体裁/作者,构造reward训练集 :param args: :param tokenizer: :return: processed json list ''' converter = opencc.OpenCC('t2s.json') t1 = time.time() fp = [ # 四书五经 os.path.join(args.data_dir, "chinese-poetry", "lunyu", "lunyu.json"), os.path.join(args.data_dir, "chinese-poetry", "sishuwujing", "*.json"), # 古体诗 os.path.join(args.data_dir, "chinese-poetry", "caocaoshiji", "caocao.json"), os.path.join(args.data_dir, 
"chinese-poetry", "shijing", "shijing.json"), # 楚辞 os.path.join(args.data_dir, "chinese-poetry", "chuci", "chuci.json"), # 诗 os.path.join(args.data_dir, "chinese-poetry", "shi", "poet*.json"), # 词 os.path.join(args.data_dir, "chinese-poetry", "ci", "ci*.json"), os.path.join(args.data_dir, "chinese-poetry", "nalanxingde", "*.json"), os.path.join(args.data_dir, "chinese-poetry", "wudai", "huajianji", "*juan.json"), os.path.join(args.data_dir, "chinese-poetry", "wudai", "nantang", "poetrys.json"), # 曲 os.path.join(args.data_dir, "chinese-poetry", "yuanqu", "yuanqu.json"), ] fs = [each for f in fp for each in glob.glob(f)] l5 = [] dicts = dict() for fi in fs: lines = json.load(open(fi, "r", encoding="utf-8")) if isinstance(lines, dict): lines = [lines] for i, line in enumerate(lines): if "lunyu" in fi: author = "孔子" genre = "经书" title = line['chapter'] contents = "".join(line['paragraphs']) elif "daxue" in fi: author = "曾子" genre = "经书" title = "大学" contents = converter.convert("".join(line['paragraphs'])).replace("「", "“").replace("」", "”") elif "mengzi" in fi: author = "孟子" genre = "经书" title = converter.convert(line['chapter']) contents = converter.convert("".join(line['paragraphs'])).replace("「", "“").replace("」", "”") elif "zhongyong" in fi: author = "孔伋" genre = "经书" title = "中庸" contents = converter.convert("".join(line['paragraphs'])).replace("「", "“").replace("」", "”") elif "caocao" in fi: author = "曹操" genre = "古体诗" title = line['title'] contents = "".join(line['paragraphs']) elif "shijing" in fi: author = "诗经" genre = "古体诗" title = line['chapter'] + "-" + line['section'] + "-" + line['title'] contents = "".join(line['content']) elif "chuci" in fi: author = line['author'] genre = "楚辞" title = line['section'] + "-" + line['title'] contents = "".join(line['content']) elif "nalanxingde" in fi: author = line['author'] genre = "词" title = line['title'] contents = "".join(line['para']) elif "huajianci" in fi: author = line['author'] genre = "词" title = 
line['title'] contents = "".join(line['paragraphs']) elif "nantang" in fi: author = line['author'] genre = "词" title = line['title'] contents = "".join(line['paragraphs']) elif "yuanqu" in fi: author = line['author'] genre = "曲" title = line['title'] contents = "".join(line['paragraphs']) elif "shi" in fi: if len(line['paragraphs']) <= 0: continue author = converter.convert(line['author']) genre = "五言诗" if len(line['paragraphs'][0]) == 12 else "七言诗" title = converter.convert(line['title']) contents = converter.convert("".join(line['paragraphs'])) elif "ci" in fi: author = line['author'] genre = "词" title = line['rhythmic'] contents = "".join(line['paragraphs']) if genre not in dicts: dicts[genre] = dict() if author not in dicts[genre]: dicts[genre][author] = dict() quantifier = "篇" if genre in ["经书", "楚辞"] else "首" prompt = f"以{author}的风格,写一{quantifier}{genre},题为{title}" answers = [{"answer": contents, "score": 1}] l5.append({"prompt": prompt, "answers": answers, "genre": genre, "title": title, "author": author}) dicts[genre][author][title] = contents t2 = time.time() logger.info(f"length: {len(l5)}, # different lengths: {len(dicts)}, time taken: {t2-t1} s") data = [] fo = os.path.join(args.output_dir, "chinese_poetry.jsonl") with open(fo, "w", encoding="utf-8") as w: for l in tqdm(enumerate(l5), desc="Processing Chinese Poetry"): genre = l['genre'] author = l['author'] title = l['title'] prompt = l['prompt'] answers = l['answers'] # 同作者其他作品-2 titles_tmp = set(dicts[genre][author].keys()) titles_tmp.remove(title) if len(titles_tmp) > 0: t = random.choice(list(titles_tmp)) answers.append({"answer": dicts[genre][author][t], "score": 0}) # 同体裁其他作者其他作品-1 authors_tmp = set(dicts[genre].keys()) authors_tmp.remove(author) a = random.choice(list(authors_tmp)) t = random.choice(list(dicts[genre][a].keys())) answers.append({"answer": dicts[genre][a][t], "score": -1}) # 不同体裁作品-0 genres_tmp = set(dicts.keys()) genres_tmp.remove(genre) g = random.choice(list(genres_tmp)) a = 
random.choice(list(dicts[g].keys())) t = random.choice(list(dicts[g][a].keys())) answers.append({"answer": dicts[g][a][t], "score": -2}) w.write(json.dumps({"prompt": prompt, "answers": answers, "prefix": ""}, ensure_ascii=False)+'\n') data.append({"prompt": prompt, "answers": answers, "prefix": ""}) logger.info(f"length: {len(data)}, time taken: {time.time()-t2} s") return data def baike_qa_2019(args, tokenizer): ''' 百科问答数据集,每个问题只有一个答案,可构造sft训练集 :param args: :param tokenizer: :return: processed json list ''' fs = glob.glob(os.path.join(args.data_dir, "baike_qa2019", "baike_qa_*.json")) fo = os.path.join(args.output_dir, "baike_qa.jsonl") data = [] t = time.time() with open(fo, "w", encoding="utf-8") as w: for f in fs: with open(f, "r", encoding="utf-8") as r: while True: line = r.readline() if not line: break item = json.loads(line.strip("\n")) question = clean_text(item['title'] if len(item['title']) > len(item['desc']) else item['desc']) prompt = question prefix = "答:" answer = clean_text(item['answer']) answers = [{"answer": answer, "score": 1}] w.write(json.dumps({"prompt": prompt, "answers": answers, "prefix": prefix}, ensure_ascii=False)+'\n') data.append({"prompt": prompt, "answers": answers, "prefix": prefix}) logger.info(f"length: {len(data)}, time taken: {time.time()-t} s") return data def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--model_name_or_path", type=str, required=True) parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() return args def main(): args = get_parser() logger.info(f"Parameters: {args}") # set random seed set_seed(args.seed) # load tokenizer tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_cache=False, trust_remote_code=True) # process raw datasets data_weibo = weibo_summary_comment(args, tokenizer) data_couplets = couplets(args, tokenizer) 
    data_zhidao = zhidao(args, tokenizer)
    data_chinese_classical = chinese_classical(args, tokenizer)
    data_chinese_poetry = chinese_poetry(args, tokenizer)
    data_baike = baike_qa_2019(args, tokenizer)
    # merge processed datasets
    data = data_weibo + data_couplets + data_zhidao + data_chinese_classical + data_chinese_poetry + data_baike
    random.shuffle(data)
    fo = os.path.join(args.output_dir, "train_data_external_v1.jsonl")
    with open(fo, "w", encoding="utf-8") as w:
        for d in data:
            w.write(json.dumps(d, ensure_ascii=False)+'\n')
    # NOTE(review): the dev set is the first 10k of the (shuffled) train set,
    # i.e. dev is a subset of train — confirm this overlap is intentional.
    fo = os.path.join(args.output_dir, "dev_data_external_v1.jsonl")
    with open(fo, "w", encoding="utf-8") as w:
        for d in data[:10000]:
            w.write(json.dumps(d, ensure_ascii=False)+'\n')
    logger.info("Finished saving processed train & dev files")


if __name__ == "__main__":
    main()


================================================
FILE: src/eval_pretrain.py
================================================
import sys
sys.path.insert(0, "/root/autodl-tmp/Code/RLHF")
sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt")
# sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt")
sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt")
import os
import argparse
import json
import numpy as np
import torch
import collections
from tqdm import tqdm
from torch.utils.data import DataLoader, SequentialSampler
from torchmetrics.text.perplexity import Perplexity
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList

from src.data.data import (
    OCNLIDataset,
    CMNLIDataset,
    CHIDDataset,
    CMRCDataset,
    CLUEWSCDataset,
    C3Dataset,
    AFQMCDataset,
    CSLDataset,
    IFLYTEKDataset,
    TNEWSDataset,
    CEvalDataset,
    MMLUDataset,
)
from src.utils import RESOURCE_PATH, load_tokenizer_and_model, load_checkpoint
from src.utils.file_utils import set_seed, print_rank_0

# Task name -> dataset class used by main() below.
DATASET = {
    "ceval": CEvalDataset,
    "mmlu": MMLUDataset,
    # NLI
    "ocnli": OCNLIDataset,
    "cmnli": CMNLIDataset,
    # Cloze and completion
    "chid": CHIDDataset,
    # MRC
    "cmrc2018": CMRCDataset,
    # Winograd
    "cluewsc2020": CLUEWSCDataset,
    # common sense reasoning
    "c3": C3Dataset,
    # Text Classification
    "tnews": TNEWSDataset,
    "iflytek": IFLYTEKDataset,
    "afqmc": AFQMCDataset,
    "csl": CSLDataset
}


def get_parser():
    """Build and parse the CLI arguments for pretrain evaluation."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--model_name_or_path", type=str, required=True)
    parser.add_argument("--task", type=str, required=True)
    parser.add_argument("--seed", type=int, default=42)
    # parser.add_argument("--local_rank", type=int, default=0)
    # parser.add_argument("--multi_card", action="store_true")
    parser.add_argument("--bits", type=int, default=16)
    parser.add_argument("--device_map", type=str, default=None,
                        help="device map to allocate model,"
                             "[None] means cpu"
                             "[0, 1, 2, ...] numbers mean single-card"
                             "[auto, balanced, balanced_low_0] means multi-card")
    parser.add_argument("--max_length", type=int, default=2048)
    parser.add_argument("--max_length_generation", type=int, default=1,
                        help="Maximum number of newly generated tokens")
    parser.add_argument("--checkpoint", type=str)
    parser.add_argument("--low_cpu_mem_usage", action="store_true",
                        help="whether to enable low cpu memory usage"
                             "when loading model")
    # eval
    parser.add_argument("--eval_filename", type=str, default=None)
    parser.add_argument("--train_filename", type=str, default=None)
    parser.add_argument("--submission_filename", type=str, default=None)
    parser.add_argument("--eval_batch_size", type=int, default=4)
    parser.add_argument("--max_few_shot", type=int, default=15,
                        help="Maximum number of examples in few-shot evaulation")
    parser.add_argument("--cot", action="store_true",
                        help="Whether to use Chain of Thought in evaluation")
    parser.add_argument("--do_sample", action="store_true")
    parser.add_argument("--num_return_sequences", type=int, default=1)
    parser.add_argument("--top_k", type=int,
                        default=10)
    parser.add_argument("--top_p", type=float, default=0.8)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--repetition_penalty", type=float, default=1.0)
    args = parser.parse_args()
    return args


def extract_cot_answer(line, response):
    #TODO: to be implemented
    pass


def main():
    args = get_parser()
    print_rank_0(f"Parameters: {args}")
    set_seed(args.seed)
    # load model and tokenizer
    tokenizer, model, eos_token_id = load_tokenizer_and_model(args)
    if args.checkpoint is not None:
        # checkpoint directory name becomes part of the output filename suffix
        suffix = args.checkpoint.split(os.sep)[-2] + "_"
        load_checkpoint(args, model)
    else:
        suffix = ""
    print_rank_0(f"Finished loading model and tokenizer")
    # Set up the datasets
    dataset = DATASET.get(args.task, None)
    if dataset is None:
        raise ValueError(f"Unsupported task: {args.task}")
    train_filename = os.path.join(args.data_dir, args.train_filename) if args.train_filename is not None else None
    dev_dataset = dataset(args, os.path.join(args.data_dir, args.eval_filename), tokenizer, train_filename)
    # Set up the metric
    perplexity = Perplexity(ignore_index=tokenizer.pad_token_id)

    def preprocess_logits_for_metrics(logits, labels):
        # Per-sample perplexity: one Perplexity() call per row of the batch.
        labels = labels.detach().cpu()
        probs = torch.softmax(logits, dim=-1).detach().cpu().to(torch.float32)
        ppls = []
        for i in range(probs.shape[0]):
            ppl = perplexity(probs[i:i+1], labels[i:i+1])
            ppls.append(ppl)
        return torch.stack(ppls)

    def calculate_f1(pred_text, label_text):
        # Token-level F1 (SQuAD-style) computed with bag-of-token overlap.
        pred_tokens = tokenizer(pred_text, add_special_tokens=False, return_attention_mask=False,
                                return_token_type_ids=False, return_tensors="pt")['input_ids'][0].tolist()
        label_tokens = tokenizer(label_text, add_special_tokens=False, return_attention_mask=False,
                                 return_token_type_ids=False, return_tensors="pt")['input_ids'][0].tolist()
        common = collections.Counter(pred_tokens) & collections.Counter(label_tokens)
        num_same = sum(common.values())
        if len(pred_tokens) == 0 or len(label_tokens) == 0:
            return int(pred_tokens == label_tokens)
        if num_same == 0:
            return 0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(label_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"
    model.eval()
    if args.train_filename is None:
        output_filename = os.path.join(args.output_dir, f"{args.task}_{args.eval_filename}_zero-shot_{args.max_length}_{suffix}eval_result.jsonl")
    else:
        assert args.max_few_shot > 0
        output_filename = os.path.join(args.output_dir, f"{args.task}_{args.eval_filename}_{args.max_few_shot}-shot_{args.max_length}_{suffix}eval_result.jsonl")
    if args.task in ["cmrc2018"]:
        # Extractive MRC: generate an answer, score with exact match and token F1.
        # text_generator = TextGenerationPipeline(model, tokenizer, device=device)
        ems = []
        f1s = []
        with open(output_filename, "w", encoding="utf-8") as w:
            with torch.no_grad():
                for dev_data in tqdm(dev_dataset.post_list, desc="Generation"):
                    prompt = dev_data['prompt']
                    label = dev_data['label']
                    if "glm" in args.model_name_or_path.lower():
                        # GLM models need the [MASK] token and special generation inputs
                        prompt += tokenizer.mask_token
                        inputs = tokenizer(prompt, return_tensors="pt")
                        inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=args.max_length + args.max_length_generation)
                        inputs = inputs.to(device)
                        outputs = model.generate(**inputs, max_new_tokens=args.max_length_generation,
                                                 eos_token_id=eos_token_id,
                                                 pad_token_id=tokenizer.pad_token_id,
                                                 do_sample=False, num_return_sequences=args.num_return_sequences,
                                                 top_p=args.top_p, temperature=args.temperature)
                    else:
                        inputs = tokenizer(prompt, add_special_tokens=False, return_token_type_ids=False, return_tensors="pt")
                        inputs = inputs.to(device)
                        outputs = model.generate(**inputs, max_new_tokens=args.max_length_generation,
                                                 pad_token_id=tokenizer.pad_token_id,
                                                 do_sample=False, num_return_sequences=args.num_return_sequences,
                                                 top_p=args.top_p, temperature=args.temperature)
                    # outputs = text_generator(prompt, max_length=args.max_length_generation,
                    #                          do_sample=True, num_return_sequences=args.num_return_sequences,
                    #                          top_p=args.top_p, temperature=args.temperature)
                    # results = [output['generated_text'].split("答:", maxsplit=1)[1].replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "") for output in outputs]
                    # keep only the text after the "答:" marker
                    results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                    results = [result.split("答:", maxsplit=1)[1] for result in results]
                    # metrics calculation: best EM/F1 over all labels x predictions
                    em_max = -1
                    f1_max = -1
                    for l in label:
                        for pred_text in results:
                            label_text = l['text']
                            em = 1 if pred_text == label_text else 0
                            f1 = calculate_f1(pred_text, label_text)
                            w.write(json.dumps({"prompt": prompt, "label": label_text, "pred": pred_text, "em": em, "f1": f1}, ensure_ascii=False)+"\n")
                            if em > em_max:
                                em_max = em
                            if f1 > f1_max:
                                f1_max = f1
                    ems.append(em_max)
                    f1s.append(f1_max)
        print_rank_0(f"em={np.mean(ems)}, f1={np.mean(f1s)}")
    elif args.task in ["ceval"]:
        # C-Eval multiple choice: compare first-step logits of "A"/"B"/"C"/"D".
        results = dict()
        with torch.no_grad():
            for dev_data in tqdm(dev_dataset, desc="C-Eval Evaluation"):
                subject_name_key = dev_data['subject_name_key']
                if subject_name_key not in results:
                    results[subject_name_key] = list()
                if "chatglm" in args.model_name_or_path.lower():
                    # replace NaN/inf logits with a spike on the model-specific fallback token
                    logits_processor = LogitsProcessorList()
                    if "chatglm2" in args.model_name_or_path.lower():
                        class InvalidScoreLogitsProcessor(LogitsProcessor):
                            def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
                                if torch.isnan(scores).any() or torch.isinf(scores).any():
                                    scores.zero_()
                                    scores[..., 5] = 5e4
                                return scores
                    else:
                        class InvalidScoreLogitsProcessor(LogitsProcessor):
                            def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
                                if torch.isnan(scores).any() or torch.isinf(scores).any():
                                    scores.zero_()
                                    scores[..., 20005] = 5e4
                                return scores
                    logits_processor.append(InvalidScoreLogitsProcessor())
                    input_ids = dev_data['input_ids'].to(device)
                    outputs = model.generate(input_ids=input_ids, max_new_tokens=args.max_length_generation,
                                             do_sample=args.do_sample, num_return_sequences=args.num_return_sequences,
                                             top_p=args.top_p, temperature=args.temperature,
                                             repetition_penalty=args.repetition_penalty,
                                             logits_processor=logits_processor,
                                             output_scores=not args.cot,
                                             return_dict_in_generate=not args.cot)
                elif "qwen" in args.model_name_or_path.lower():
                    input_ids = dev_data['input_ids'].to(device)
                    outputs = model.generate(input_ids=input_ids, max_new_tokens=args.max_length_generation,
                                             do_sample=args.do_sample, num_return_sequences=args.num_return_sequences,
                                             top_p=args.top_p, temperature=args.temperature,
                                             repetition_penalty=args.repetition_penalty,
                                             output_scores=not args.cot,
                                             return_dict_in_generate=not args.cot)
                else:
                    input_ids = dev_data['input_ids'].to(device)
                    attention_mask = dev_data['attention_mask'].to(device)
                    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                             max_new_tokens=args.max_length_generation,
                                             do_sample=args.do_sample, num_return_sequences=args.num_return_sequences,
                                             top_p=args.top_p, temperature=args.temperature,
                                             repetition_penalty=args.repetition_penalty,
                                             output_scores=not args.cot,
                                             return_dict_in_generate=not args.cot)
                # output processing and answer extraction
                if args.cot:
                    # NOTE(review): the CoT path decodes a response but never extracts
                    # or records an answer (extract_cot_answer is unimplemented).
                    outputs = outputs['sequences'].tolist()[0][len(input_ids["input_ids"][0]):]
                    response = tokenizer.decode(outputs)
                    # response, _ = model.chat(tokenizer, dev_data['question'], history=dev_data['history'],
                    #                          do_sample=False, )
                    response = response.strip()
                    # ans, direct_extract = extract_cot_answer(dev_data, response)
                else:
                    # pick the option whose token has the highest first-step logit
                    logits = outputs['scores'][0].flatten()
                    pred = torch.tensor(
                        [
                            logits[tokenizer.encode("A", add_special_tokens=False)[0]],
                            logits[tokenizer.encode("B", add_special_tokens=False)[0]],
                            logits[tokenizer.encode("C", add_special_tokens=False)[0]],
                            logits[tokenizer.encode("D", add_special_tokens=False)[0]],
                        ]
                    ).argmax().detach().cpu().tolist()
                    pred = {0: "A", 1: "B", 2: "C", 3: "D"}[pred]
                    # correct = 1 if pred == label else 0
                    results[subject_name_key].append((dev_data['id'], dev_data['answer'], pred))
        # metrics calculation
        subject_mapping = json.load(open(os.path.join(RESOURCE_PATH, "eval", "ceval", "subject_mapping.json")))
        with open(output_filename, "w", encoding="utf-8") as w:
            result_dict = dict()
            acc_dict = dict()
            for subject_name_key, vals in results.items():
                if subject_name_key not in result_dict:
                    result_dict[subject_name_key] = dict()
                # subject_mapping value index 2 holds the domain name
                domain = subject_mapping[subject_name_key][2]
                if domain not in acc_dict:
                    acc_dict[domain] = {"ct": 0, "correct": 0}
                for id_, label, pred in vals:
                    result_dict[subject_name_key][str(id_)] = pred
                    acc_dict[domain]['correct'] += 1 if pred == label else 0
                    acc_dict[domain]['ct'] += 1
                    w.write(json.dumps({"subject_name_key": subject_name_key, "id": id_, "pred": pred, "label": label}, ensure_ascii=False)+"\n")
        # if submission file is not none, then there is no label to calculate accuracy
        if args.submission_filename is not None:
            json.dump(result_dict, open(os.path.join(args.output_dir, args.submission_filename), "w", encoding="utf-8"), ensure_ascii=False)
            print_rank_0(f"Finished saving C-Eval Evaluation Result")
        else:
            ct = 0
            correct = 0
            for domain, val in acc_dict.items():
                ct += val['ct']
                correct += val['correct']
                print_rank_0(f"[C-Eval Evaluation Result] domain: {domain}, acc: {val['correct'] / val['ct']}")
            print_rank_0(f"[C-Eval Evaluation Result] total acc: {correct / ct}")
    elif args.task in ["mmlu"]:
        # MMLU multiple choice: same first-step-logit scheme as C-Eval.
        results = dict()
        with torch.no_grad():
            for dev_data in tqdm(dev_dataset, desc="MMLU Evaluation"):
                subject_name_key = dev_data['subject_name_key']
                if subject_name_key not in results:
                    results[subject_name_key] = list()
                if "chatglm" in args.model_name_or_path.lower():
                    logits_processor = LogitsProcessorList()
                    if "chatglm2" in args.model_name_or_path.lower():
                        class InvalidScoreLogitsProcessor(LogitsProcessor):
                            def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
                                if torch.isnan(scores).any() or torch.isinf(scores).any():
                                    scores.zero_()
                                    scores[..., 5] = 5e4
                                return scores
                    else:
                        class InvalidScoreLogitsProcessor(LogitsProcessor):
                            def __call__(self, input_ids: torch.LongTensor, scores:
torch.FloatTensor) -> torch.FloatTensor: if torch.isnan(scores).any() or torch.isinf(scores).any(): scores.zero_() scores[..., 20005] = 5e4 return scores logits_processor.append(InvalidScoreLogitsProcessor()) input_ids = dev_data['input_ids'].to(device) outputs = model.generate(input_ids=input_ids, max_new_tokens=args.max_length_generation, do_sample=args.do_sample, num_return_sequences=args.num_return_sequences, top_p=args.top_p, temperature=args.temperature, repetition_penalty=args.repetition_penalty, logits_processor=logits_processor, output_scores=True, return_dict_in_generate=True) elif "qwen" in args.model_name_or_path.lower(): input_ids = dev_data['input_ids'].to(device) outputs = model.generate(input_ids=input_ids, max_new_tokens=args.max_length_generation, do_sample=args.do_sample, num_return_sequences=args.num_return_sequences, top_p=args.top_p, temperature=args.temperature, repetition_penalty=args.repetition_penalty, output_scores=True, return_dict_in_generate=True) else: input_ids = dev_data['input_ids'].to(device) attention_mask = dev_data['attention_mask'].to(device) outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=args.max_length_generation, do_sample=args.do_sample, num_return_sequences=args.num_return_sequences, top_p=args.top_p, temperature=args.temperature, repetition_penalty=args.repetition_penalty, output_scores=True, return_dict_in_generate=True) # output processing and answer extraction logits = outputs['scores'][0].flatten() pred = torch.tensor( [ logits[tokenizer.encode("A", add_special_tokens=False)[0]], logits[tokenizer.encode("B", add_special_tokens=False)[0]], logits[tokenizer.encode("C", add_special_tokens=False)[0]], logits[tokenizer.encode("D", add_special_tokens=False)[0]], ] ).argmax().detach().cpu().tolist() pred = {0: "A", 1: "B", 2: "C", 3: "D"}[pred] # correct = 1 if pred == label else 0 results[subject_name_key].append((dev_data['answer'], pred)) # metrics calculation subject_mapping 
= json.load(open(os.path.join(RESOURCE_PATH, "eval", "mmlu", "subject_mapping.json"))) with open(output_filename, "w", encoding="utf-8") as w: acc_dict = dict() for subject_name_key, vals in results.items(): domain = subject_mapping[subject_name_key][1] if domain not in acc_dict: acc_dict[domain] = {"ct": 0, "correct": 0} for label, pred in vals: # result_dict[subject_name_key] = pred acc_dict[domain]['correct'] += 1 if pred == label else 0 acc_dict[domain]['ct'] += 1 w.write(json.dumps({"subject_name_key": subject_name_key, "pred": pred, "label": label}, ensure_ascii=False)+"\n") ct = 0 correct = 0 for domain, val in acc_dict.items(): ct += val['ct'] correct += val['correct'] print_rank_0(f"[MMLU Evaluation Result] domain: {domain}, acc: {val['correct'] / val['ct']}") print_rank_0(f"[MMLU Evaluation Result] total acc: {correct / ct}") else: sampler = SequentialSampler(dev_dataset) dev_dataloader = DataLoader(dev_dataset, sampler=sampler, batch_size=args.eval_batch_size) ppl_list = [] input_ids_list = [] label_list = [] ls_list = [] with torch.no_grad(): for batch in tqdm(dev_dataloader, desc="Evaluation"): input_ids = batch['input_ids'].squeeze(1).to(device) attention_mask = batch['attention_mask'].squeeze(1).to(device) labels = batch['labels'].squeeze(1).to(device) out = model(input_ids, attention_mask=attention_mask) ppls = preprocess_logits_for_metrics(out.logits, labels) input_ids_list.extend(batch['input_ids'].detach().cpu().tolist()) ppl_list.extend(ppls.detach().cpu().tolist()) label_list.extend(batch['label_str']) if args.task in ['chid', 'c3', 'iflytek', 'tnews']: ls = np.array(batch['candidates']).transpose().tolist() ls_list.extend(ls) else: vals = list(dev_dataset.label_dict.values()) ls_list.extend([vals]*input_ids.shape[0]) ct = 0 ct_acc = 0 ppls = [] with open(output_filename, "w", encoding="utf-8") as w: for i, (input_ids, label, ls, ppl) in enumerate(zip(input_ids_list, label_list, ls_list, ppl_list)): ppls.append(ppl) prompt = 
tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0] if i % len(ls) == len(ls) - 1: lidx = ls.index(label) if np.argmin(ppls) == lidx: ct_acc += 1 ct += 1 # cur_label = None ppls = [] w.write(json.dumps({"prompt": prompt, "pred": float(ppl), "label": label}, ensure_ascii=False) + "\n") print_rank_0(f"ppl={ct_acc/ct}") if __name__ == "__main__": main() ================================================ FILE: src/models/__init__.py ================================================ from src.models.reward import RewardModel from src.models.loss import PairWiseLoss # from src.models.llama import LlamaForCausalLM, LlamaModel, LlamaPreTrainedModel ================================================ FILE: src/models/loss.py ================================================ import torch import torch.nn as nn class PairWiseLoss(nn.Module): """ Pairwise Loss for Reward Model """ def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor: probs = torch.sigmoid(chosen_reward - reject_reward) log_probs = torch.log(probs) loss = -log_probs.mean() return loss ================================================ FILE: src/models/ppo.py ================================================ # Copyright 2022 CarperAI & The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # NOTE: This file contains a modified version of the `PreTrainedModelWrapper` class from # HuggingFace's `trl` library. 
# ...The original source code can be found here:
# https://github.com/lvwerra/trl/blob/78c13226bf8ea1ccd9b1c091f03a938098521f6c/trl/models/modeling_base.py

import gc
import inspect
import json
import os

import numpy as np
import torch
import torch.nn as nn
import transformers

from copy import deepcopy
from typing import Any, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass
from transformers.modeling_outputs import ModelOutput
from transformers.models.bloom import modeling_bloom
from transformers.models.opt import modeling_opt
from huggingface_hub import hf_hub_download
from peft import (
    LoraConfig,
    get_peft_model
)

from src.utils.modeling_utils import (
    hf_get_decoder,
    hf_get_decoder_blocks,
    hf_get_decoder_final_norm,
    hf_get_hidden_size,
    hf_get_lm_head,
    hf_get_num_hidden_layers,
    make_head
)


class PreTrainedModelWrapper(nn.Module, transformers.utils.PushToHubMixin):
    """A wrapper around `transformers.PreTrainedModel`

    Reference: @younesbelkada's `PreTrainedModelWrapper`
    https://github.com/lvwerra/trl/blob/4f5c16fafde42d9aca971952bcdcc1f5a0a68cf0/trl/models/modeling_base.py#L2

    Attributes:
        _auto_model_parent_class (transformers.AutoModel): The `transformers.AutoModel`
            type to base the wrapping behavior off of, e.g. `transformers.AutoModelForCausalLM`.
        _supported_modules (List[str]): A list of attribute names for modules of
            the underlying architecture model. This is used, for example, to save
            and load any additional modules by manipulating the state dict.
        _supported_args (List[str]): A list of arguments specific to the underlying
            architecture to separate from arguments that are supported by the parent
            `AutoModel` class. Any arguments that are not supported by the underlying
            model will be passed to the parent `AutoModel` class.
    """

    _auto_model_parent_class: transformers.AutoModel = None
    _supported_modules: List[str] = None
    # TODO (jon-tow): Supported args should come from a `PretrainedConfig` of the
    # specific underlying type similar to how config instances can be used to instantiate
    # `transformers.PreTrainedModel`s.
    _supported_args: List[str] = None

    def __init__(self, base_model: Optional[transformers.PreTrainedModel] = None, **kwargs):
        super().__init__()
        self.base_model = base_model
        # cache `forward` args for general use (avoids incompatible args across architectures)
        # NOTE(review): this raises AttributeError if `base_model` is None — assumes
        # callers always pass a model despite the Optional default. TODO confirm.
        self.forward_kwargs = inspect.getfullargspec(self.base_model.forward).args

    @classmethod
    def _split_kwargs(cls, kwargs: Dict[str, Any]):
        """Separates the kwargs from the supported arguments within `supported_args`
        and those that are not
        """
        supported_kwargs = {}
        unsupported_kwargs = {}
        for key, value in kwargs.items():
            if key in cls._supported_args:
                supported_kwargs[key] = value
            else:
                unsupported_kwargs[key] = value
        return supported_kwargs, unsupported_kwargs

    @classmethod
    def from_config(cls, config: transformers.PretrainedConfig, **kwargs):
        """Instantiate the pretrained pytorch model from a configuration.

        Args:
            config (transformers.PretrainedConfig): The configuration to use to
                instantiate the base model.

        NOTE: Loading a model from its configuration file does **not** load the
        model weights. It only affects the model's configuration. Use
        `~transformers.AutoModel.from_pretrained` to load the model weights.
        """
        # `kwargs` is never None here (it is always a dict), so the else branch
        # is effectively dead; kept as-is for fidelity.
        if kwargs is not None:
            wrapped_model_kwargs, from_config_kwargs = cls._split_kwargs(kwargs)
        else:
            from_config_kwargs = {}
            wrapped_model_kwargs = {}
        base_model = cls._auto_model_parent_class.from_config(config, **from_config_kwargs)
        model = cls(base_model, **wrapped_model_kwargs)
        return model

    @classmethod
    def from_pretrained(  # noqa: max-complexity
        cls,
        pretrained_model_name_or_path: Union[str, transformers.PreTrainedModel],
        *model_args,
        **kwargs,
    ):
        """Instantiate a pretrained pytorch model from a pretrained model configuration.
        This method is a wrapper around `transformers.PreTrainedModel.from_pretrained`.
        Please refer to the documentation of `transformers.PreTrainedModel.from_pretrained`
        for more information.

        Args:
            pretrained_model_name_or_path (str or `transformers.PreTrainedModel`):
                The identifier of the pretrained model to load or the pretrained model itself.
            *model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the `_auto_model_parent_class`.
            **kwargs (dict, *optional*):
                Dictionary of keyword arguments to pass to both the underlying
                `_auto_model_parent_class` call (e.g. `transformers.AutoModelForCausalLM.from_pretrained`)
                and the specific instance of the wrapped model.

        NOTE: You must pass in arguments specific to the wrapped model as keyword arguments.
        """
        if kwargs is not None:
            wrapped_model_kwargs, from_pretrained_kwargs = cls._split_kwargs(kwargs)
        else:
            from_pretrained_kwargs = {}
            wrapped_model_kwargs = {}

        if isinstance(pretrained_model_name_or_path, str):
            # Load the base model using the `transformers` AutoClass (e.g. AutoModelForCausalLM)
            base_model = cls._auto_model_parent_class.from_pretrained(
                pretrained_model_name_or_path, *model_args, **from_pretrained_kwargs
            )
        elif isinstance(pretrained_model_name_or_path, transformers.PreTrainedModel):
            base_model = pretrained_model_name_or_path
        else:
            raise ValueError(
                f"Invalid type for `base_model_name_or_path`: {type(pretrained_model_name_or_path)}"
                "Expected `str` or `transformers.PreTrainedModel`."
            )

        # Propagate LoRA hyperparameters from the project config object onto the
        # HF model config. NOTE(review): `config` here is the project's training
        # config (has a `.train` attribute), not a `PretrainedConfig` — verify callers.
        config = from_pretrained_kwargs.get("config", None)
        if config is not None:
            base_model.config.lora_rank = config.train.lora_rank
            base_model.config.lora_alpha = config.train.lora_alpha
            base_model.config.lora_train_bias = config.train.lora_train_bias

        model = cls(base_model, **wrapped_model_kwargs)

        if isinstance(pretrained_model_name_or_path, str):
            # Resolve the checkpoint: local single file, hub single file, or
            # sharded checkpoint described by an index json.
            filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
            sharded_index_filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin.index.json")
            is_sharded = False

            if not os.path.exists(filename):
                try:
                    filename = hf_hub_download(pretrained_model_name_or_path, "pytorch_model.bin")
                # Sharded
                except Exception:
                    if os.path.exists(sharded_index_filename):
                        index_file_name = sharded_index_filename
                    else:
                        index_file_name = hf_hub_download(
                            pretrained_model_name_or_path,
                            "pytorch_model.bin.index.json",
                        )
                    with open(index_file_name, "r") as f:
                        index = json.load(f)
                    # Collect files containing weights from supported modules
                    files_to_download = set()
                    for k, v in index["weight_map"].items():
                        if any([module in k for module in cls._supported_modules]):
                            files_to_download.add(v)
                    is_sharded = True

            if is_sharded:
                # Merge each shard into a state dict
                # TODO: Optimize this to avoid wasting RAM
                state_dict = {}
                for shard_file in files_to_download:
                    filename = os.path.join(pretrained_model_name_or_path, shard_file)
                    # Download if shard file doesn't exist locally
                    if not os.path.exists(filename):
                        filename = hf_hub_download(pretrained_model_name_or_path, shard_file)
                    state_dict.update(torch.load(filename, map_location="cpu"))
            else:
                state_dict = torch.load(filename, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path.state_dict()

        # Subclasses use `post_init` to load auxiliary modules (e.g. value heads)
        # from the assembled state dict.
        model.post_init(state_dict=state_dict)
        return model

    def save_pretrained(self, *args, **kwargs):
        """Save the pretrained model to a directory. This method is a wrapper
        around `transformers.PreTrainedModel.save_pretrained`. Please refer to
        the documentation of `transformers.PreTrainedModel.save_pretrained` for
        more information.

        Args:
            *args (`list`, *optional*):
                Positional arguments passed along to the underlying model's
                `save_pretrained` method.
            **kwargs (`dict`, *optional*):
                Keyword arguments passed along to the underlying model's
                `save_pretrained` method.
        """
        state_dict = kwargs.pop("state_dict", None)
        if state_dict is None:
            # Use the wrapper's state dict so auxiliary modules are included.
            state_dict = self.state_dict()
        kwargs["state_dict"] = state_dict

        return self.base_model.save_pretrained(*args, **kwargs)

    def state_dict(self, *args, **kwargs):
        """Return the state_dict of the pretrained model."""
        raise NotImplementedError

    def post_init(self, *args, **kwargs):
        """Post initialization method. This method is called after the model is
        instantiated and loaded from a checkpoint. It can be used to perform
        additional operations such as loading the state_dict.
        """
        raise NotImplementedError

    def get_compatible_forward_kwargs(self, **kwargs) -> Dict[str, Any]:
        """Filter out arguments not supported by the specific instance of
        `base_model.transformer.forward`
        """
        # FIXME: This is a hack to get around the fact that the `transformers`
        # architectures we use don't have a consistent API for `forward` parameters.
        return {k: v for k, v in kwargs.items() if k in self.forward_kwargs}


# KL Controllers
class AdaptiveKLController:
    """Adaptive KL Controller as described in Ziegler et al.
    "Fine-Tuning Language Models from Human Preferences"
    Reference: Section 2.2 https://arxiv.org/pdf/1909.08593.pdf#page=2
    Source: https://github.com/openai/lm-human-preferences/blob/master/lm_human_preferences/train_policy.py
    """

    def __init__(self, init_kl_coef: float, target: float, horizon: int):
        self.value = init_kl_coef   # current KL coefficient β
        self.target = target        # desired KL value
        self.horizon = horizon      # smoothing horizon for the update

    def update(self, current: float, n_steps: int):
        """Returns adaptively updated KL coefficient, βₜ₊₁.
        Arguments:
            current: The current KL value between the newest policy and the initial policy.
        """
        # Clipped proportional controller: error is bounded to ±20% so a single
        # bad batch cannot swing β violently.
        proportional_error = np.clip(current / self.target - 1, -0.2, 0.2)  # ϵₜ
        mult = 1 + proportional_error * n_steps / self.horizon
        self.value *= mult  # βₜ₊₁


class FixedKLController:
    """Fixed KL controller."""

    def __init__(self, kl_coef):
        self.value = kl_coef

    def update(self, current: float, n_steps: int):
        """Returns updated KL coefficient, βₜ₊₁.
        Arguments:
            current: The current KL value between the newest policy and the initial policy.
        """
        # Intentionally a no-op: the coefficient never changes.
        pass


# CausalLM architectures
@dataclass
class CausalLMOutputWithValue(ModelOutput):
    # Standard causal-LM outputs plus the scalar value estimate per token.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    value: Optional[torch.FloatTensor] = None


class AutoModelForCausalLMWithValueHead(PreTrainedModelWrapper):
    """An `AutoModel` class wrapper for `transformers` causal models that have a
    language modeling head and a value head
    """

    _auto_model_parent_class = transformers.AutoModelForCausalLM
    _supported_modules = ["v_head"]
    _supported_args = []

    def __init__(
        self,
        base_model: transformers.PreTrainedModel,
        **kwargs
    ):
        super().__init__(base_model)
        # Scalar value head on top of the final hidden state.
        self.v_head = make_head(hf_get_hidden_size(self.base_model.config), 1)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        position_ids: Optional[List[torch.FloatTensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithValue]:
        """Run the base LM and attach value-head predictions to its outputs."""
        forward_kwargs = self.get_compatible_forward_kwargs(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Hidden states are always needed to feed the value head, regardless of
        # what the caller asked for.
        forward_kwargs["output_hidden_states"] = True
        forward_kwargs["return_dict"] = True

        outputs = self.base_model(**forward_kwargs)
        value = self.v_head(outputs.hidden_states[-1]).squeeze(-1)

        if not return_dict:
            outputs = (outputs.logits,) + outputs[1:] + (value,)
            return outputs

        return CausalLMOutputWithValue(**outputs, value=value)

    def generate(self, *args, **kwargs) -> Union[ModelOutput, torch.LongTensor]:
        # Generation is delegated untouched; the value head plays no role here.
        return self.base_model.generate(*args, **kwargs)

    def state_dict(self, *args, **kwargs):
        """
        Returns the state dictionary of the model. We add the state dictionary of
        the value head to the state dictionary of the wrapped model by prepending
        the key with `v_head.`.
        """
        base_model_state_dict = self.base_model.state_dict(*args, **kwargs)
        v_head_state_dict = self.v_head.state_dict(*args, **kwargs)
        for k, v in v_head_state_dict.items():
            base_model_state_dict[f"v_head.{k}"] = v
        return base_model_state_dict

    def post_init(self, state_dict):
        """
        Adds the state dictionary of the value head to the state dictionary of the
        wrapped model by prepending the key with `v_head.`. This function removes
        the `v_head.` prefix from the keys of the value head state dictionary.
        """
        for k in list(state_dict.keys()):
            if "v_head." in k:
                state_dict[k.replace("v_head.", "")] = state_dict.pop(k)
        # strict=False: the dict also still contains all base-model keys, which
        # the value head must ignore.
        self.v_head.load_state_dict(state_dict, strict=False)
        del state_dict
        gc.collect()  # noqa: E702


class AutoModelForCausalLMWithHydraValueHead(AutoModelForCausalLMWithValueHead):
    # Adds a frozen copy of the top `num_layers_unfrozen` layers ("hydra" head)
    # used as the reference model for the PPO KL penalty.
    _supported_modules = ["v_head", "frozen_head"]
    _supported_args = ["num_layers_unfrozen"]

    def __init__(
        self,
        base_model: transformers.PreTrainedModel,
        num_layers_unfrozen: int = -1,
    ):
        super().__init__(base_model)
        self.num_layers_unfrozen = num_layers_unfrozen
        if self.num_layers_unfrozen > 0:
            config = self.base_model.config
            branch_class = hf_get_branch_class(config)
            # Frozen branch built BEFORE any LoRA wrapping below, so it copies
            # the pristine pretrained weights.
            self.frozen_head = branch_class(
                self.base_model,
                num_layers_unfrozen=self.num_layers_unfrozen,
            ).eval()
        if base_model.config.lora_rank > 0:
            # NOTE(review): `config` here is reused — after the branch above it
            # refers to the HF model config (`target_modules`, `task_type` read
            # from it); verify those attributes exist on the model config.
            config = LoraConfig(
                r=base_model.config.lora_rank,
                lora_alpha=base_model.config.lora_alpha,
                target_modules=config.target_modules.split(","),
                lora_dropout=0.05,
                bias=base_model.config.lora_train_bias,
                task_type=config.task_type
            )
            self.base_model.enable_input_require_grads()
            self.base_model = get_peft_model(base_model, config)
            # convert_to_lora_recursively(base_model, base_model.config.lora_rank, base_model.config.lora_alpha)
            # lora.mark_only_lora_as_trainable(base_model, base_model.config.lora_train_bias)

    def forward_hydra(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        position_ids: Optional[List[torch.FloatTensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[torch.FloatTensor, CausalLMOutputWithValue]:
        """Forward through the trainable trunk, then re-run the frozen branch
        to obtain reference-model logits for the KL penalty."""
        forward_kwargs = self.get_compatible_forward_kwargs(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return_dict = forward_kwargs.get("return_dict", True)
        # The branch needs hidden states from the trunk, so force both flags on.
        forward_kwargs["return_dict"] = True
        forward_kwargs["output_hidden_states"] = True

        outputs = self.forward(**forward_kwargs)
        # Select the hidden state before the first branching layer
        input_hidden_state = outputs.hidden_states[-(self.num_layers_unfrozen + 1)]

        output_shape = outputs.hidden_states[-1].size()
        forward_kwargs.pop("input_ids", None)  # Ignore `input_ids` for branch head
        forward_kwargs.pop("inputs_embeds", None)  # Ignore `inputs_embeds` for branch head
        hydra_outputs = self.frozen_head(input_hidden_state, output_shape, **forward_kwargs)

        if not return_dict:
            return hydra_outputs.logits
        return hydra_outputs

    @classmethod
    def from_pretrained(  # noqa: max-complexity
        cls,
        pretrained_model_name_or_path: Union[str, transformers.PreTrainedModel],
        *model_args,
        **kwargs,
    ):
        """Instantiate a pretrained pytorch model from a pretrained model configuration.
        This method is a wrapper around `transformers.PreTrainedModel.from_pretrained`.
        Please refer to the documentation of `transformers.PreTrainedModel.from_pretrained`
        for more information.

        Args:
            pretrained_model_name_or_path (str or `transformers.PreTrainedModel`):
                The identifier of the pretrained model to load or the pretrained model itself.
            *model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the `_auto_model_parent_class`.
            **kwargs (dict, *optional*):
                Dictionary of keyword arguments to pass to both the underlying
                `_auto_model_parent_class` call (e.g. `transformers.AutoModelForCausalLM.from_pretrained`)
                and the specific instance of the wrapped model.

        NOTE: You must pass in arguments specific to the wrapped model as keyword arguments.
        """
        if kwargs is not None:
            wrapped_model_kwargs, from_pretrained_kwargs = cls._split_kwargs(kwargs)
        else:
            from_pretrained_kwargs = {}
            wrapped_model_kwargs = {}

        if isinstance(pretrained_model_name_or_path, str):
            # Load the base model using the `transformers` AutoClass (e.g. AutoModelForCausalLM)
            base_model = cls._auto_model_parent_class.from_pretrained(
                pretrained_model_name_or_path, *model_args, **from_pretrained_kwargs
            )
        elif isinstance(pretrained_model_name_or_path, transformers.PreTrainedModel):
            base_model = pretrained_model_name_or_path
        else:
            raise ValueError(
                f"Invalid type for `base_model_name_or_path`: {type(pretrained_model_name_or_path)}"
                "Expected `str` or `transformers.PreTrainedModel`."
            )
        # TODO: add model.resize_token_embeddings(tokenizer.vocab_size)
        # Unlike the base-class implementation, LoRA config propagation is
        # disabled here (handled in `__init__` via the model config instead).
        # config = from_pretrained_kwargs.get("config", None)
        # if config is not None:
        #     base_model.config.lora_rank = config.train.lora_rank
        #     base_model.config.lora_alpha = config.train.lora_alpha
        #     base_model.config.lora_train_bias = config.train.lora_train_bias

        if isinstance(pretrained_model_name_or_path, str):
            filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
            sharded_index_filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin.index.json")
            is_sharded = False

            if not os.path.exists(filename):
                try:
                    filename = hf_hub_download(pretrained_model_name_or_path, "pytorch_model.bin")
                # Sharded
                except Exception:
                    if os.path.exists(sharded_index_filename):
                        index_file_name = sharded_index_filename
                    else:
                        index_file_name = hf_hub_download(
                            pretrained_model_name_or_path,
                            "pytorch_model.bin.index.json",
                        )
                    with open(index_file_name, "r") as f:
                        index = json.load(f)
                    # Collect files containing weights from supported modules
                    files_to_download = set()
                    for k, v in index["weight_map"].items():
                        if any([module in k for module in cls._supported_modules]):
                            files_to_download.add(v)
                    is_sharded = True

            if is_sharded:
                # Merge each shard into a state dict
                # TODO: Optimize this to avoid wasting RAM
                state_dict = {}
                for shard_file in files_to_download:
                    filename = os.path.join(pretrained_model_name_or_path, shard_file)
                    # Download if shard file doesn't exist locally
                    if not os.path.exists(filename):
                        filename = hf_hub_download(pretrained_model_name_or_path, shard_file)
                    state_dict.update(torch.load(filename, map_location="cpu"))
            else:
                state_dict = torch.load(filename, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path.state_dict()

        # Check if sft model is LoRA checkpoint, load the state dict into model
        is_lora_checkpoint = False
        for key in state_dict.keys():
            if "lora" in key:
                is_lora_checkpoint = True
                break
        if is_lora_checkpoint:
            # base_model = SFTModelWithLoRA(base_model.config, base_model)
            # strict=False: checkpoint holds only LoRA deltas on top of the
            # already-loaded pretrained weights.
            res = base_model.load_state_dict(state_dict, strict=False)

        # Wrap AFTER the base weights are in place so the frozen branch copies
        # the loaded weights.
        model = cls(base_model, **wrapped_model_kwargs)
        model.post_init(state_dict=state_dict)
        return model


class ModelBranch(transformers.PreTrainedModel):
    """Implements the frozen upper trunk of the pretrained reference model used
    when computing the PPO KL-divergence penalty.
    """

    def __init__(
        self,
        base_model: transformers.PreTrainedModel,
        *,
        num_layers_unfrozen: int,
    ):
        """
        Args:
            base_model (transformers.PreTrainedModel): The pretrained model to extract upper trunk from
            num_layers_unfrozen (int): The number of trainable layers
        """
        super().__init__(base_model.config)

        # The branch is defined by the last `num_layers_unfrozen` layers of the pretrained model
        # deepcopy: the branch must keep its own frozen weights even after the
        # trunk's copies are updated by training.
        decoder_blocks = deepcopy(hf_get_decoder_blocks(base_model))
        self.decoder_blocks = nn.ModuleList(list(decoder_blocks)[-num_layers_unfrozen:])
        self.final_norm = deepcopy(hf_get_decoder_final_norm(base_model))
        self.lm_head = deepcopy(hf_get_lm_head(base_model))

        self.hidden_size = hf_get_hidden_size(self.config)
        self.model_parallel = False
        self.device_map = None
        self.last_device = None
        self.gradient_checkpointing = False

        # Freeze the entire branch
        for parameter in self.parameters():
            parameter.requires_grad_(False)


class GPTModelBranch(ModelBranch):
    def forward(  # noqa: max-complexity
        self,
        hidden_states: torch.Tensor,  # Takes as input hidden_states instead of input_ids
        output_shape: torch.Tensor,  # output_size given by main trunk
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = False,
    ) -> Union[Tuple, CausalLMOutputWithValue]:
        """Reference:
        https://github.com/huggingface/transformers/blob/2411f0e465e761790879e605a4256f3d4afb7f82/src/transformers/models/gpt2/modeling_gpt2.py#L743  # noqa: E501
        """
        batch_size = hidden_states.size()[0]

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        device = hidden_states.device

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.decoder_blocks))

        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # Broadcast the 2-D mask to [batch, 1, 1, seq] and convert 0/1 to
            # additive -inf/0 bias (standard GPT-2 masking).
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        if self.config.add_cross_attention and encoder_hidden_states is not None:
            (
                encoder_batch_size,
                encoder_sequence_length,
                _,
            ) = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        head_mask = self.get_head_mask(head_mask, hf_get_num_hidden_layers(self.config))

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.decoder_blocks, past_key_values)):
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                if layer_past is not None:
                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if isinstance(head_mask, torch.Tensor):
                    head_mask = head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Assumes we are never training the branch
            # Some block implementations accept cross-attention args, others
            # don't — dispatch on the block's actual signature.
            block_params = inspect.getfullargspec(block.forward).args
            if "encoder_hidden_states" in block_params:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)

            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.final_norm(hidden_states)

        hidden_states = hidden_states.view(output_shape)
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if self.model_parallel:
            torch.cuda.set_device(self.transformer.first_device)
            hidden_states = hidden_states.to(self.lm_head.weight.device)

        lm_logits = self.lm_head(hidden_states)

        if not return_dict:
            outputs = (lm_logits,) + (None,) + (None,)
            return outputs

        return CausalLMOutputWithValue(
            logits=lm_logits,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class OPTModelBranch(ModelBranch):
    def forward(  # noqa: max-complexity
        self,
        hidden_states: torch.Tensor,
        output_shape: torch.Tensor,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] =
        None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = False,
    ) -> Union[Tuple, CausalLMOutputWithValue]:
        """Reference:
        https://github.com/huggingface/transformers/blob/bdb84e2bada3658f99c6a81c963ec562f8485151/src/transformers/models/opt/modeling_opt.py#L840  # noqa: E501
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones(hidden_states.shape[:2], dtype=torch.bool, device=hidden_states.device)

        # Combine the causal mask with the padding mask into one additive bias,
        # mirroring the upstream OPT decoder.
        input_shape = hidden_states.size()[:-1]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = modeling_opt._make_causal_mask(
                input_shape,
                hidden_states.dtype,
                past_key_values_length=past_key_values_length,
            ).to(hidden_states.device)

        if attention_mask is not None:
            expanded_attn_mask = modeling_opt._expand_mask(
                attention_mask, hidden_states.dtype, tgt_len=input_shape[-1]
            ).to(hidden_states.device)
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )
        attention_mask = combined_attention_mask

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != (len(self.decoder_blocks)):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.decoder_blocks)} layers, but it is for"
                        f" {head_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.decoder_blocks):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            layer_outputs = decoder_layer(
                hidden_states,
                past_key_value=past_key_value,
                attention_mask=attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if self.final_norm is not None:
            hidden_states = self.final_norm(hidden_states)

        # TODO: Add output projection support
        # https://github.com/huggingface/transformers/blob/699e90437f984d69ad3c9b891dd2e9d0fc2cffe4/src/transformers/models/opt/modeling_opt.py#L499  # noqa: E501
        # if self.project_out is not None:
        #     hidden_states = self.project_out(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        lm_logits = self.lm_head(hidden_states).contiguous()

        if not return_dict:
            return tuple(
                v
                for v in [
                    lm_logits,
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_self_attns,
                ]
                if v is not None
            )

        return CausalLMOutputWithValue(
            logits=lm_logits,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class BloomModelBranch(ModelBranch):
    def forward(  # noqa: max-complexity
        self,
        hidden_states: torch.Tensor,  # Takes as input hidden_states instead of input_ids
        output_shape: torch.Tensor,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask:
        Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = False,
    ) -> Union[Tuple, CausalLMOutputWithValue]:
        """Reference:
        https://github.com/huggingface/transformers/blob/2411f0e465e761790879e605a4256f3d4afb7f82/src/transformers/models/bloom/modeling_bloom.py#L623  # noqa: E501
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length = hidden_states.shape[:2]

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.decoder_blocks))

        head_mask = self.get_head_mask(head_mask, hf_get_num_hidden_layers(self.config))

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values[0] is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        # BLOOM uses ALiBi positional biases instead of position embeddings.
        alibi = modeling_bloom.build_alibi_tensor(attention_mask, self.config.n_head, dtype=hidden_states.dtype)

        combined_attention_mask = None
        device = attention_mask.device
        input_shape = (batch_size, seq_length)
        _, src_length = input_shape

        if src_length > 1:
            combined_attention_mask = modeling_bloom._make_causal_mask(
                input_shape,
                device=device,
                past_key_values_length=past_key_values_length,
            )

        # BLOOM masks are boolean, hence combined with `|` (not additive).
        expanded_attn_mask = modeling_bloom._expand_mask(attention_mask, tgt_length=src_length)
        combined_attention_mask = (
            expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
        )
        causal_mask = combined_attention_mask

        for i, (block, layer_past) in enumerate(zip(self.decoder_blocks, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=causal_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                alibi=alibi,
            )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.final_norm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        lm_logits = self.lm_head(hidden_states)

        if not return_dict:
            return tuple(
                v
                for v in [
                    lm_logits,
                    hidden_states,
                    presents,
                    all_hidden_states,
                    all_self_attentions,
                ]
                if v is not None
            )

        return CausalLMOutputWithValue(
            logits=lm_logits,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


# Seq2Seq architectures
@dataclass
class Seq2SeqLMOutputWithValue(ModelOutput):
    # Standard seq2seq LM outputs plus the scalar value estimate per token.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    value: Optional[torch.FloatTensor] = None


class
class AutoModelForSeq2SeqLMWithValueHead(PreTrainedModelWrapper):
    """An `AutoModel` class wrapper for `transformers` sequence-to-sequence models
    that have a language modeling head and a value head.

    The value head maps each decoder hidden state to a scalar value estimate,
    as used by PPO-style RLHF training.
    """

    _auto_model_parent_class = transformers.AutoModelForSeq2SeqLM
    _supported_modules = ["v_head"]
    _supported_args = []

    def __init__(
        self,
        base_model: transformers.PreTrainedModel,
        **kwargs
    ):
        super().__init__(base_model)
        # One scalar output per position of the last hidden state.
        self.v_head = make_head(hf_get_hidden_size(self.base_model.config), 1)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.FloatTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = True,
        output_hidden_states: Optional[bool] = True,
        return_dict: Optional[bool] = None,
    ) -> Seq2SeqLMOutputWithValue:
        """Run the wrapped seq2seq model and attach per-token value estimates.

        Returns a `Seq2SeqLMOutputWithValue` whose `value` field is the v_head
        applied to the last hidden state (shape: batch x seq_len).
        """
        forward_kwargs = self.get_compatible_forward_kwargs(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # The value head always needs hidden states regardless of caller flags.
        forward_kwargs["output_hidden_states"] = True
        forward_kwargs["return_dict"] = True
        outputs = self.base_model(**forward_kwargs)
        # BUGFIX: accessing a missing field on a `transformers.ModelOutput`
        # raises AttributeError, not KeyError, so the original
        # `try: outputs.mems / except KeyError:` fallback could never trigger
        # and crashed for models without `mems`. GLM-style models expose
        # `mems`; standard HF models expose `hidden_states` — handle both.
        hidden_states = getattr(outputs, "mems", None)
        if hidden_states is None:
            hidden_states = outputs.hidden_states
        last_hidden_state = hidden_states[-1]
        value = self.v_head(last_hidden_state).squeeze(-1)
        return Seq2SeqLMOutputWithValue(
            loss=outputs.loss,
            logits=outputs.logits,
            decoder_hidden_states=hidden_states,
            value=value,
        )

    def generate(self, *args, **kwargs) -> Union[ModelOutput, torch.LongTensor]:
        """Delegate generation entirely to the wrapped base model."""
        return self.base_model.generate(*args, **kwargs)

    def state_dict(self, *args, **kwargs):
        """
        Returns the state dictionary of the model. We add the state dictionary of the value head
        to the state dictionary of the wrapped model by prepending the key with `v_head.`.
        """
        base_model_state_dict = self.base_model.state_dict(*args, **kwargs)
        v_head_state_dict = self.v_head.state_dict(*args, **kwargs)
        for k, v in v_head_state_dict.items():
            base_model_state_dict[f"v_head.{k}"] = v
        return base_model_state_dict

    def post_init(self, state_dict):
        """
        We add the state dictionary of the value head to the state dictionary of the wrapped model
        by prepending the key with `v_head.`. This function removes the `v_head.` prefix from the
        keys of the value head state dictionary.
        """
        for k in list(state_dict.keys()):
            if "v_head." in k:
                state_dict[k.replace("v_head.", "")] = state_dict.pop(k)
        self.v_head.load_state_dict(state_dict, strict=False)
        # Free the (potentially large) checkpoint dict eagerly.
        del state_dict
        gc.collect()
class AutoModelForSeq2SeqLMWithHydraValueHead(AutoModelForSeq2SeqLMWithValueHead):
    """Value-head seq2seq wrapper that additionally keeps a frozen "hydra" branch.

    The frozen branch is a read-only copy of the top `num_layers_unfrozen`
    decoder layers, used by `forward_hydra` to compute reference logits
    without a second full model.
    """

    _supported_modules = ["v_head", "frozen_head"]
    _supported_args = ["num_layers_unfrozen"]

    def __init__(
        self,
        base_model: transformers.PreTrainedModel,
        num_layers_unfrozen: int = -1,
    ):
        super().__init__(base_model=base_model)
        self.num_layers_unfrozen = num_layers_unfrozen
        if self.num_layers_unfrozen > 0:
            # Only T5-style decoder branches are implemented here.
            branch_class = T5Branch  # TODO: Add support for other model branches
            self.frozen_head = branch_class(
                self.base_model,
                num_layers_unfrozen=self.num_layers_unfrozen,
            ).eval()
        # if base_model.config.lora_rank > 0:
        #     convert_to_lora_recursively(base_model, base_model.config.lora_rank, base_model.config.lora_alpha)
        #     lora.mark_only_lora_as_trainable(base_model, base_model.config.lora_train_bias)

    def forward_hydra(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.FloatTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Seq2SeqLMOutputWithValue:
        """Compute reference logits by replaying the top decoder layers frozen.

        Runs the trainable model first, then feeds the hidden state just below
        the first unfrozen layer through `self.frozen_head`.
        """
        forward_kwargs = self.get_compatible_forward_kwargs(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Preserve the caller's return_dict preference for the branch call.
        return_dict = forward_kwargs.get("return_dict", True)
        forward_kwargs["output_hidden_states"] = True
        forward_kwargs["return_dict"] = True
        outputs = self.forward(**forward_kwargs)
        # Select the hidden state before the first branching layer
        input_hidden_state = outputs.decoder_hidden_states[-(self.num_layers_unfrozen + 1)]
        # NOTE(review): `self.forward` returns a Seq2SeqLMOutputWithValue that
        # does not populate `encoder_last_hidden_state`, so this may pass None
        # to the frozen head — verify against the wrapped model's output.
        hydra_outputs = self.frozen_head(
            hidden_states=input_hidden_state,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=outputs.encoder_last_hidden_state,
            encoder_attention_mask=attention_mask,
            use_cache=False,
            output_attentions=False,
            output_hidden_states=True,
            return_dict=return_dict,
        )
        if not return_dict:
            return hydra_outputs.logits
        return hydra_outputs

    @classmethod
    def from_pretrained(  # noqa: max-complexity
        cls,
        pretrained_model_name_or_path: Union[str, transformers.PreTrainedModel],
        *model_args,
        **kwargs,
    ):
        """Instantiate a pretrained pytorch model from a pretrained model configuration.
        This method is a wrapper around `transformers.PreTrainedModel.from_pretrained`. Please
        refer to the documentation of `transformers.PreTrainedModel.from_pretrained` for more
        information.

        Args:
            pretrained_model_name_or_path (str or `transformers.PreTrainedModel`):
                The identifier of the pretrained model to load or the pretrained model itself.
            *model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the `_auto_model_parent_class`.
            **kwargs (dict, *optional*):
                Dictionary of keyword arguments to pass to both the underlying
                `_auto_model_parent_class` call (e.g. `transformers.AutoModelForCausalLM.from_pretrained`)
                and the specific instance of the wrapped model.

        NOTE: You must pass in arguments specific to the wrapped model as keyword arguments.
        """
        # kwargs is always a dict here ({} when no kwargs were given), so the
        # else branch is effectively dead; kept as-is for safety.
        if kwargs is not None:
            wrapped_model_kwargs, from_pretrained_kwargs = cls._split_kwargs(kwargs)
        else:
            from_pretrained_kwargs = {}
            wrapped_model_kwargs = {}
        if isinstance(pretrained_model_name_or_path, str):
            # Load the base model using the `transformers` AutoClass (e.g. AutoModelForCausalLM)
            base_model = cls._auto_model_parent_class.from_pretrained(
                pretrained_model_name_or_path, *model_args, **from_pretrained_kwargs
            )
        elif isinstance(pretrained_model_name_or_path, transformers.PreTrainedModel):
            base_model = pretrained_model_name_or_path
        else:
            raise ValueError(
                f"Invalid type for `base_model_name_or_path`: {type(pretrained_model_name_or_path)}"
                "Expected `str` or `transformers.PreTrainedModel`."
            )
        config = from_pretrained_kwargs.get("config", None)
        # if config is not None:
        #     base_model.config.lora_rank = config.train.lora_rank
        #     base_model.config.lora_alpha = config.train.lora_alpha
        #     base_model.config.lora_train_bias = config.train.lora_train_bias
        if isinstance(pretrained_model_name_or_path, str):
            filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
            sharded_index_filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin.index.json")
            is_sharded = False
            if not os.path.exists(filename):
                try:
                    filename = hf_hub_download(pretrained_model_name_or_path, "pytorch_model.bin")
                # Sharded
                # NOTE(review): this broad `except Exception` treats *any*
                # download failure (network error, auth) as "checkpoint is
                # sharded" — confirm this is intended.
                except Exception:
                    if os.path.exists(sharded_index_filename):
                        index_file_name = sharded_index_filename
                    else:
                        index_file_name = hf_hub_download(
                            pretrained_model_name_or_path,
                            "pytorch_model.bin.index.json",
                        )
                    with open(index_file_name, "r") as f:
                        index = json.load(f)
                    # Collect files containing weights from supported modules
                    files_to_download = set()
                    for k, v in index["weight_map"].items():
                        if any([module in k for module in cls._supported_modules]):
                            files_to_download.add(v)
                    is_sharded = True
            if is_sharded:
                # Merge each shard into a state dict
                # TODO: Optimize this to avoid wasting RAM
                state_dict = {}
                for shard_file in files_to_download:
                    filename = os.path.join(pretrained_model_name_or_path, shard_file)
                    # Download if shard file doesn't exist locally
                    if not os.path.exists(filename):
                        filename = hf_hub_download(pretrained_model_name_or_path, shard_file)
                    state_dict.update(torch.load(filename, map_location="cpu"))
            else:
                state_dict = torch.load(filename, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path.state_dict()
        # Check if sft model is LoRA checkpoint, load the state dict into model
        is_lora_checkpoint = False
        for key in state_dict.keys():
            if "lora" in key:
                is_lora_checkpoint = True
                break
        if is_lora_checkpoint:
            # base_model = SFTModelWithLoRA(base_model.config, base_model)
            res = base_model.load_state_dict(state_dict, strict=False)
        model = cls(base_model, **wrapped_model_kwargs)
        # post_init strips the `v_head.` prefix and loads the value head.
        model.post_init(state_dict=state_dict)
        return model
class T5Branch(ModelBranch):
    """Decoder only T5 branch"""

    def __init__(
        self,
        base_model: transformers.PreTrainedModel,
        *,
        num_layers_unfrozen: int,
    ):
        super().__init__(base_model, num_layers_unfrozen=num_layers_unfrozen)
        # T5 applies dropout after the final layer norm; reuse the decoder's.
        self.dropout = hf_get_decoder(base_model).dropout
        self.is_decoder = True

    def forward(  # noqa: max-complexity
        self,
        hidden_states: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqLMOutputWithValue]:
        """Run the frozen top decoder layers on precomputed hidden states.

        Reference:
        https://github.com/huggingface/transformers/blob/bc21aaca789f1a366c05e8b5e111632944886393/src/transformers/models/t5/modeling_t5.py#L899 # noqa: E501
        """
        # assumes hidden_states is (batch, seq, dim) — TODO confirm at call site
        batch_size, seq_length = hidden_states.shape[:2]
        input_shape = (batch_size, seq_length)
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if attention_mask is None:
            attention_mask = torch.ones(batch_size, seq_length, device=hidden_states.device)
        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(
                batch_size, encoder_seq_length, device=hidden_states.device, dtype=torch.long
            )
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=hidden_states.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        # Position biases are computed by the first layer and reused by the rest.
        position_bias = None
        encoder_decoder_position_bias = None
        for _, layer_module in enumerate(self.decoder_blocks):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_outputs = layer_module(
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            # Normalize the tuple layout so the fixed indices below hold
            # whether or not a present key/value slot was returned.
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
            hidden_states, present_key_value_state = layer_outputs[:2]
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
        hidden_states = self.final_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        sequence_output = hidden_states
        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 # noqa: E501
            sequence_output = sequence_output * (self.config.d_model**-0.5)
        lm_logits = self.lm_head(sequence_output)
        if not return_dict:
            return (lm_logits,)
        return Seq2SeqLMOutputWithValue(
            logits=lm_logits,
            decoder_hidden_states=all_hidden_states,
            decoder_attentions=all_attentions,
        )


# Branch class utils
def hf_get_branch_class(
    config: transformers.PretrainedConfig,
) -> "ModelBranch":
    """Returns the model branch class for the given config."""
    gpt_branch_supported_archs = [
        "GPTJForCausalLM",
        "GPT2LMHeadModel",
        "GPTNeoForCausalLM",
        "GPTNeoXForCausalLM",
    ]
    opt_branch_supported_archs = ["OPTForCausalLM"]
    bloom_branch_supported_archs = ["BloomModel", "BloomForCausalLM"]
    # Dispatch on the first declared architecture of the config.
    arch = config.architectures[0]
    if arch in gpt_branch_supported_archs:
        return GPTModelBranch
    elif arch in opt_branch_supported_archs:
        return OPTModelBranch
    elif arch in bloom_branch_supported_archs:
        return BloomModelBranch
    else:
        all_supported_archs = sum(
            [
                gpt_branch_supported_archs,
                opt_branch_supported_archs,
                bloom_branch_supported_archs,
            ],
            [],
        )
        raise ValueError(
            f"Unsupported architecture: `{arch}`. The following architectures are "
            f"available for model branching:\n{all_supported_archs}"
        )
The following architectures are " f"available for model branching:\n{all_supported_archs}" ) ================================================ FILE: src/models/reward.py ================================================ import torch from torch import nn from transformers.modeling_utils import PreTrainedModel class RewardModel(PreTrainedModel): supports_gradient_checkpointing = True def __init__(self, config, model, tokenizer): super().__init__(config) self.config = config self.model_type = config.model_type self.pad_id = tokenizer.pad_token_id self.transformer = model # set the device of `v_head` the same as the device of the last component of `model` st = model.state_dict() device = st[list(st.keys())[-1]].device self.v_head = nn.Linear(config.hidden_size, 1, bias=False, device=device) # self.loss_fn = PairWiseLoss() def gradient_checkpointing_enable(self): self.transformer.gradient_checkpointing_enable() def gradient_checkpointing_disable(self): self.transformer.gradient_checkpointing_disable() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, PreTrainedModel): module.gradient_checkpointing = value def reward( self, input_ids=None, attention_mask=None, position_ids=None, use_cache=None, ): batch_size = input_ids.shape[0] transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask, position_ids=position_ids, use_cache=use_cache) if self.model_type == "glm": hidden_states = transformer_outputs.mems[-1] elif self.model_type == "chatglm": hidden_states = transformer_outputs[0].permute(1, 0, 2) elif self.model_type == "pangu": hidden_states = transformer_outputs[0] hidden_states = hidden_states.squeeze(1) else: hidden_states = transformer_outputs[0] assert len(hidden_states.shape) == 3 values = self.v_head(hidden_states).squeeze(-1) # outputs = self.body(sequences, attention_mask=attention_mask) # last_hidden_states = outputs['last_hidden_state'] # values = self.value_head(last_hidden_states)[:, :-1] rewards = None # 
rewards = values.mean(dim=-1) # if len(rewards.shape) == 2: # rewards = rewards.squeeze(1) # ensure shape is (B) # # assert len(rewards.shape) == 1 and rewards.shape[0] == batch_size return values, rewards def forward( self, chosen_input_ids, chosen_attention_mask=None, chosen_position_ids=None, rejected_input_ids=None, rejected_attention_mask=None, rejected_position_ids=None, use_cache=None, **kwargs ): bs = chosen_input_ids.shape[0] chosen_end_scores = [] rejected_end_scores = [] # compute reward for chosen inputs chosen_values, chosen_reward = self.reward(chosen_input_ids, attention_mask=chosen_attention_mask, position_ids=chosen_position_ids, use_cache=use_cache) if len(chosen_input_ids.shape) == 3: chosen_input_ids = chosen_input_ids.squeeze(1) # compute reward for rejected inputs if it is not none if rejected_input_ids is not None: reject_values, reject_reward = self.reward(rejected_input_ids, attention_mask=rejected_attention_mask, position_ids=rejected_position_ids, use_cache=use_cache) # loss = self.loss_fn(chosen_reward, reject_reward) if len(rejected_input_ids.shape) == 3: rejected_input_ids = rejected_input_ids.squeeze(1) loss = 0 for i in range(bs): # Check if there is any padding otherwise take length of sequence c_inds = (chosen_input_ids[i] == self.pad_id).nonzero() c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen_input_ids.shape[1] r_inds = (rejected_input_ids[i] == self.pad_id).nonzero() r_ind = r_inds[0].item() if len(r_inds) > 0 else rejected_input_ids.shape[1] end_ind = max(c_ind, r_ind) # Retrieve first index where trajectories diverge divergence_ind = (chosen_input_ids[i] != rejected_input_ids[i]).nonzero()[0] assert divergence_ind > 0 # Index into the correct rewards c_truncated_reward = chosen_values[i][divergence_ind:end_ind] r_truncated_reward = reject_values[i][divergence_ind:end_ind] # Use the last non-padding token output as reward score chosen_end_scores.append(chosen_values[i][c_ind-1]) 
rejected_end_scores.append(reject_values[i][r_ind-1]) # Compute loss loss += -torch.log(torch.sigmoid(c_truncated_reward - r_truncated_reward)).mean() loss = loss / bs rejected_end_scores = torch.stack(rejected_end_scores) else: reject_values = None rejected_end_scores = None loss = None for i in range(bs): c_inds = (chosen_input_ids[i] == self.pad_id).nonzero() c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen_input_ids.shape[1] chosen_end_scores.append(chosen_values[i, c_ind - 1]) chosen_end_scores = torch.stack(chosen_end_scores) return { "loss": loss, "chosen_values": chosen_values, "chosen_reward": chosen_end_scores, "reject_values": reject_values, "reject_reward": rejected_end_scores } ================================================ FILE: src/models/rlhf_engine.py ================================================ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 import os import time import torch import deepspeed import math import json from deepspeed.ops.adam import FusedAdam from deepspeed.ops.adam import DeepSpeedCPUAdam from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, get_scheduler from transformers import AutoConfig, AutoModel from transformers.deepspeed import HfDeepSpeedConfig from peft import ( prepare_model_for_kbit_training, LoraConfig, get_peft_model ) from src.utils.config import get_train_ds_config, get_eval_ds_config # from utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters from src.models import RewardModel from src.utils.logger import logger, RESOURCE_PATH from src.utils.modeling_utils import get_optimizer_grouped_parameters """ TODOs: * support HF models for critic (for debugging), must be a previously saved ckpt from step-2 * determine ds_config/zero_stage based on model size, gpu style, world size, etc - get model size by creating simple meta model - 1.3b: zero-2 for actor/ref models, zero-0 for others - 13b+: zero-3 for all models """ def 
def log_init(model_name, rank, stime=None):
    """Log the start/end of a model's initialization on rank 0.

    Call once with `stime=None` before init ("start") and once with the
    returned timestamp after init ("end", with duration). Always returns
    the current wall-clock time so the two calls can be chained.
    """
    if rank == 0:
        tag = "start" if stime is None else "end"
        suffix = "ing" if stime is None else "ed"
        duration = ""
        if stime is not None:
            duration = "(duration: {:.2f}s)".format(time.time() - stime)
        logger.info(f"[{tag}] Initializ{suffix} {model_name} Model [{tag}] {duration}")
    return time.time()


def create_hf_model(model_class,
                    model_name_or_path,
                    tokenizer,
                    ds_config=None,
                    rlhf_training=False,
                    disable_dropout=False):
    """Build a HF model and stamp LoRA target/task metadata onto its config.

    When `rlhf_training` is True, weights are NOT loaded here (the critic
    loader supplies them); only the architecture is built from config.
    Raises ValueError for model families other than pangu/chatglm/glm.
    """
    model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
    if disable_dropout:
        model_config.dropout = 0.0
    # Note: dschf is defined in function scope to avoid global effects
    # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration
    # NOTE(review): the HfDeepSpeedConfig object must stay alive while the
    # model is constructed for ZeRO-3 partitioned init to kick in — it is
    # unused after this, which is intentional.
    if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3:
        dschf = HfDeepSpeedConfig(ds_config)
    else:
        dschf = None
    if rlhf_training:
        # the weight loading is handled by create critic model
        model = model_class.from_config(model_config, trust_remote_code=True)
    else:
        model = model_class.from_pretrained(
            model_name_or_path,
            from_tf=bool(".ckpt" in model_name_or_path),
            config=model_config,
            trust_remote_code=True)
    model.config.end_token_id = tokenizer.eos_token_id
    # model.config.pad_token_id = model.config.eos_token_id
    # model.resize_token_embeddings(int(
    #     8 *
    #     math.ceil(len(tokenizer) / 8.0)))  # make the vocab size multiple of 8
    # Substring dispatch: "chatglm" must be tested before "glm" because
    # "glm" is a substring of "chatglm".
    if "pangu" in model_name_or_path:
        model.config.target_modules = "q_proj,k_proj,v_proj"
        model.config.task_type = "CAUSAL_LM"
    elif "chatglm" in model_name_or_path:
        model.config.target_modules = "query_key_value"
        model.config.task_type = "SEQ_2_SEQ_LM"
    elif "glm" in model_name_or_path:
        model.config.target_modules = "query_key_value"
        model.config.task_type = "SEQ_2_SEQ_LM"
    else:
        raise ValueError(f"Unsupported model type: {model_name_or_path}")
    return model


def create_critic_model(model_name_or_path,
                        tokenizer,
                        ds_config,
                        num_padding_at_beginning=0,
                        rlhf_training=False,
                        disable_dropout=False,
                        checkpoint=None,
                        lora_rank=0,
                        lora_alpha=1,
                        lora_train_bias="none"):
    """Build a RewardModel wrapped around a base transformer.

    When `rlhf_training` is True, `checkpoint` must point to a saved
    step-2 reward-model state dict, which is loaded onto the critic.
    """
    # OPT model family always put a padding token at the beginning of the sequence,
    # we did not see this in other models but not sure if it is a general rule
    if "pangu" in model_name_or_path:
        model_class = AutoModelForCausalLM
    elif "glm" in model_name_or_path:
        model_class = AutoModelForSeq2SeqLM
    else:
        raise ValueError(f"Unsupported model type: {model_name_or_path}")
    critic_model = create_hf_model(model_class, model_name_or_path, tokenizer, ds_config,
                                   rlhf_training, disable_dropout)
    critic_model.config.lora_rank = lora_rank
    critic_model.config.lora_alpha = lora_alpha
    critic_model.config.lora_train_bias = lora_train_bias
    # The RewardModel wraps the bare transformer backbone, whose attribute
    # name differs per family (`transformer` for pangu/chatglm, `glm` for GLM).
    if "pangu" in model_name_or_path or "chatglm" in model_name_or_path:
        critic_model = RewardModel(critic_model.config,
                                   critic_model.transformer,
                                   tokenizer,
                                   # num_padding_at_beginning=num_padding_at_beginning
                                   )
    else:
        critic_model = RewardModel(critic_model.config, critic_model.glm, tokenizer)
    if rlhf_training:
        assert os.path.exists(checkpoint), f"Cannot find reward model checkpoint at {checkpoint}"
        critic_model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
    return critic_model


class DeepSpeedRLHFEngine:
    """Builds and owns every DeepSpeed engine used for PPO training:
    actor, reference, optional EMA actor, reward, and optional critic.
    """

    def __init__(self, actor_model_name_or_path, critic_model_name_or_path,
                 tokenizer, args, num_total_iters):
        self.args = args
        self.num_total_iters = num_total_iters
        self.tokenizer = tokenizer
        if "pangu" in actor_model_name_or_path:
            self.model_class = AutoModelForCausalLM
        elif "glm" in actor_model_name_or_path:
            self.model_class = AutoModelForSeq2SeqLM
        else:
            raise ValueError(f"Unsuppported model type: {actor_model_name_or_path}")
        self.actor = self._init_actor(
            actor_model_name_or_path=actor_model_name_or_path)
        self.ref = self._init_ref(
            actor_model_name_or_path=actor_model_name_or_path)
        self.actor_ema = None
        if self.args.enable_ema:
            self.actor_ema = self._init_ema(
                actor_model_name_or_path=actor_model_name_or_path)
        self.reward = self._init_reward(
            critic_model_name_or_path=critic_model_name_or_path)
        # Critic is only built when actor and critic are optimized jointly.
        if args.enable_joint_optimization:
            self.critic = self._init_critic(
                critic_model_name_or_path=critic_model_name_or_path)
            if self.args.critic_gradient_checkpointing:
                self.critic.gradient_checkpointing_enable()
        else:
            self.critic = None
    def _init_actor(self, actor_model_name_or_path):
        """Create the trainable actor engine (optionally LoRA-wrapped)."""
        stime = log_init("Actor", self.args.local_rank)
        # DS Config
        ds_config = get_train_ds_config(
            global_batch_size=self.args.global_train_batch_size_actor,
            micro_batch_size=self.args.ppo_train_batch_size,
            gradient_accumulation_steps=self.args.gradient_accumulation_steps_actor,
            offload=self.args.offload,
            stage=self.args.actor_zero_stage,
            enable_hybrid_engine=self.args.enable_hybrid_engine,
            inference_tp_size=self.args.inference_tp_size,
            release_inference_cache=self.args.release_inference_cache,
            pin_parameters=(not self.args.unpin_actor_parameters),
            tp_gather_partition_size=self.args.tp_gather_partition_size,
            max_out_tokens=self.args.max_length)
        # Model
        actor_model = create_hf_model(
            model_class=self.model_class,
            model_name_or_path=actor_model_name_or_path,
            tokenizer=self.tokenizer,
            ds_config=ds_config,
            disable_dropout=self.args.disable_actor_dropout)
        # LoRA
        if self.args.actor_lora_rank > 0:
            config = LoraConfig(
                r=self.args.actor_lora_rank,
                lora_alpha=self.args.actor_lora_alpha,
                target_modules=actor_model.config.target_modules.split(","),
                lora_dropout=0.05,
                task_type=actor_model.config.task_type
            )
            actor_model.enable_input_require_grads()
            actor_model = get_peft_model(actor_model, config)
            # actor_model = convert_linear_layer_to_lora(
            #     actor_model, self.args.actor_lora_module_name,
            #     self.args.actor_lora_rank)
            # if self.args.only_optimize_lora:
            #     actor_model = only_optimize_lora_parameters(actor_model)
        # Optimizer
        AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam
        optim_params = get_optimizer_grouped_parameters(
            actor_model, self.args.actor_weight_decay)
        optim = AdamOptimizer(optim_params,
                              lr=self.args.actor_learning_rate,
                              betas=(0.9, 0.95))
        # LR Scheduler
        lr_scheduler = get_scheduler(
            name=self.args.lr_scheduler_type,
            optimizer=optim,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=self.num_total_iters,
        )
        # DeepSpeed Engine
        actor_engine, *_ = deepspeed.initialize(model=actor_model,
                                                optimizer=optim,
                                                lr_scheduler=lr_scheduler,
                                                config=ds_config)
        # Expose pad id on the engine config for downstream generation code.
        actor_engine.config['pad_token_id'] = actor_model.config.pad_token_id
        log_init("Actor", self.args.local_rank, stime=stime)
        return actor_engine

    def _init_ref(self, actor_model_name_or_path):
        """Create the frozen reference-policy engine (eval-only config)."""
        stime = log_init("Ref", self.args.local_rank)
        # DS Config
        zero_stage = self.args.actor_zero_stage
        if zero_stage != 3:
            # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory for ref model
            zero_stage = 0
        ds_config = get_eval_ds_config(global_batch_size=self.args.global_train_batch_size_actor,
                                       micro_batch_size=self.args.ppo_train_batch_size,
                                       gradient_accumulation_steps=self.args.eval_accumulation_steps,
                                       offload=self.args.offload_reference_model,
                                       stage=zero_stage)
        ref_model = create_hf_model(self.model_class,
                                    actor_model_name_or_path,
                                    self.tokenizer,
                                    ds_config)
        ref_engine, *_ = deepspeed.initialize(model=ref_model, config=ds_config)
        log_init("Ref", self.args.local_rank, stime=stime)
        return ref_engine

    def _init_ema(self, actor_model_name_or_path):
        """Create the EMA copy of the actor (eval-only, LoRA-matched)."""
        stime = log_init("EMA", self.args.local_rank)
        # DS Config
        zero_stage = self.args.actor_zero_stage
        if zero_stage != 3:
            # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory
            zero_stage = 0
        ds_config = get_eval_ds_config(global_batch_size=self.args.global_train_batch_size_actor,
                                       micro_batch_size=self.args.ppo_train_batch_size,
                                       gradient_accumulation_steps=self.args.eval_accumulation_steps,
                                       offload=self.args.offload_reference_model,
                                       stage=zero_stage)
        actor_model_ema = create_hf_model(self.model_class,
                                          actor_model_name_or_path,
                                          self.tokenizer,
                                          ds_config)
        # Mirror the actor's LoRA wrapping so EMA weights line up 1:1.
        if self.args.actor_lora_rank > 0:
            config = LoraConfig(
                r=self.args.actor_lora_rank,
                lora_alpha=self.args.actor_lora_alpha,
                target_modules=actor_model_ema.config.target_modules.split(","),
                lora_dropout=0.05,
                task_type=actor_model_ema.config.task_type
            )
            actor_model_ema.enable_input_require_grads()
            actor_model_ema = get_peft_model(actor_model_ema, config)
            # actor_model_ema = convert_linear_layer_to_lora(
            #     actor_model_ema, self.args.actor_lora_module_name,
            #     self.args.actor_lora_rank)
        ema_engine, *_ = deepspeed.initialize(model=actor_model_ema, config=ds_config)
        log_init("EMA", self.args.local_rank, stime=stime)
        return ema_engine

    def _init_critic(self, critic_model_name_or_path):
        """Create the trainable critic engine from the step-2 reward checkpoint."""
        stime = log_init("Critic", self.args.local_rank)
        ds_config = get_train_ds_config(global_batch_size=self.args.global_train_batch_size_critic,
                                        micro_batch_size=self.args.ppo_train_batch_size,
                                        gradient_accumulation_steps=self.args.gradient_accumulation_steps,
                                        offload=self.args.offload,
                                        stage=self.args.critic_zero_stage)
        #TODO(jeff): should not be needed, we should be able to use ds_config above
        #TODO(jeff): it means we never create the critic w. zero.init context if we are using ZeRO-3
        ds_eval_config = get_eval_ds_config(global_batch_size=self.args.global_train_batch_size_critic,
                                            micro_batch_size=self.args.ppo_train_batch_size,
                                            gradient_accumulation_steps=self.args.eval_accumulation_steps,
                                            offload=False,
                                            stage=0)
        # Model
        # NOTE(review): the model is built under the eval config but the
        # engine below initializes with the train ds_config — see TODOs above.
        critic_model = create_critic_model(
            model_name_or_path=critic_model_name_or_path,
            tokenizer=self.tokenizer,
            ds_config=ds_eval_config,
            # num_padding_at_beginning=self.args.num_padding_at_beginning,
            rlhf_training=True,
            disable_dropout=self.args.disable_critic_dropout,
            checkpoint=self.args.critic_checkpoint,
            lora_rank=self.args.critic_lora_rank,
            lora_alpha=self.args.lora_alpha,
            lora_train_bias=self.args.lora_train_bias)
        # LoRA
        if self.args.critic_lora_rank > 0:
            config = LoraConfig(
                r=self.args.critic_lora_rank,
                lora_alpha=self.args.critic_lora_alpha,
                target_modules=critic_model.config.target_modules.split(","),
                lora_dropout=0.05,
                task_type=critic_model.config.task_type
            )
            critic_model.enable_input_require_grads()
            critic_model = get_peft_model(critic_model, config)
            # critic_model = convert_linear_layer_to_lora(
            #     critic_model, self.args.critic_lora_module_name,
            #     self.args.critic_lora_rank)
            # if self.args.only_optimize_lora:
            #     critic_model = only_optimize_lora_parameters(critic_model)
        # Optimizer
        AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam
        optim_pararms = get_optimizer_grouped_parameters(
            critic_model, self.args.critic_weight_decay)
        optim = AdamOptimizer(optim_pararms,
                              lr=self.args.critic_learning_rate,
                              betas=(0.9, 0.95))
        # LR Scheduler
        lr_scheduler = get_scheduler(
            name=self.args.lr_scheduler_type,
            optimizer=optim,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=self.num_total_iters,
        )
        # DeepSpeed Engine
        critic_engine, *_ = deepspeed.initialize(model=critic_model,
                                                 optimizer=optim,
                                                 lr_scheduler=lr_scheduler,
                                                 config=ds_config)
        log_init("Critic", self.args.local_rank, stime=stime)
        return critic_engine

    def _init_reward(self, critic_model_name_or_path):
        """Create the frozen reward-model engine (inference only)."""
        stime = log_init("Reward", self.args.local_rank)
        # DS Config
        zero_stage = self.args.critic_zero_stage
        if zero_stage != 3:
            # If critic is ZeRO-3 then we use it for everything, otherwise assume we have enough memory
            zero_stage = 0
        ds_config = get_eval_ds_config(global_batch_size=self.args.global_train_batch_size_critic,
                                       micro_batch_size=self.args.ppo_train_batch_size,
                                       gradient_accumulation_steps=self.args.eval_accumulation_steps,
                                       offload=self.args.offload,
                                       stage=zero_stage)
        #TODO(jeff): should not be needed, we should be able to use ds_config above
        #TODO(jeff): it means we never create the critic w. zero.init context if we are using ZeRO-3
        ds_eval_config = get_eval_ds_config(global_batch_size=self.args.global_train_batch_size_critic,
                                            micro_batch_size=self.args.ppo_train_batch_size,
                                            gradient_accumulation_steps=self.args.eval_accumulation_steps,
                                            offload=False,
                                            stage=0)
        # Model
        reward_model = create_critic_model(
            model_name_or_path=critic_model_name_or_path,
            tokenizer=self.tokenizer,
            ds_config=ds_eval_config,
            # num_padding_at_beginning=self.args.num_padding_at_beginning,
            rlhf_training=True,
            checkpoint=self.args.critic_checkpoint,
            lora_rank=self.args.critic_lora_rank,
            lora_alpha=self.args.lora_alpha,
            lora_train_bias=self.args.lora_train_bias)
        reward_engine, *_ = deepspeed.initialize(model=reward_model, config=ds_config)
        log_init("Reward", self.args.local_rank, stime=stime)
        return reward_engine


================================================
FILE: src/models/trainer.py
================================================
from collections import defaultdict
import deepspeed
import sys
import json
import os
import ray
import torch
import logging
import uuid
import torch.nn.functional as F
from abc import abstractmethod
from datasets import Dataset
from deepspeed.runtime.zero import ZeroParamStatus
from time import time
from torch import nn
from tqdm import tqdm
from typing import Any, Callable, Iterable, Dict, List, Optional, Tuple, Union, Literal
from torch.utils.data import
DataLoader from accelerate import Accelerator # type: ignore from ray.air import session try: from ray.train import Checkpoint except ImportError: from ray.air.checkpoint import Checkpoint from rich.console import Console from rich.table import Table from transformers import ( AutoTokenizer, PretrainedConfig, Trainer, PreTrainedModel, TrainingArguments, DataCollator, PreTrainedTokenizerBase, TrainerCallback, ) from trl.models import create_reference_model from trl.trainer.utils import disable_dropout_in_model, pad_to_length from src.utils.logger import logger from src.utils.config import TRLConfig from src.data.pipeline import BaseRolloutStore from src.utils.file_utils import significant, print_gpu_utilization, print_gpu_utilization_torch from src.utils.modeling_utils import ( filter_non_scalars, get_distributed_config, get_git_tag, get_optimizer_class, get_scheduler_class, flatten_dict, freeze_bottom_causal_layers, freeze_bottom_seq2seq_layers, get_delta_model_class, parse_delta_kwargs, ) from src.data.data_types import PromptBatch, PPORLBatch, PPORLElement from src.models.ppo import ( AdaptiveKLController, AutoModelForCausalLMWithHydraValueHead, AutoModelForSeq2SeqLMWithHydraValueHead, FixedKLController, ) from src.data.pipeline import BasePipeline, PPORolloutStorage from src.utils.modeling_utils import Clock, RunningMoments, logprobs_of_labels from src.utils.logger import logger # specifies a dictionary of architectures _TRAINERS: Dict[str, Any] = {} # registry def register_trainer(name): """Decorator used to register a trainer Args: name: Name of the trainer type to register """ def register_class(cls, name): _TRAINERS[name] = cls setattr(sys.modules[__name__], name, cls) return cls if isinstance(name, str): name = name.lower() return lambda c: register_class(c, name) cls = name name = cls.__name__ register_class(cls, name.lower()) return cls @register_trainer class BaseRLTrainer: def __init__( self, config: TRLConfig, reward_fn=None, metric_fn=None, 
@register_trainer
class BaseRLTrainer:
    """Abstract base for RL trainers: holds the rollout store, the config and
    the optional reward/metric callables shared by all concrete trainers."""

    def __init__(
        self,
        config: TRLConfig,
        reward_fn=None,
        metric_fn=None,
        logit_mask=None,
        stop_sequences=None,
        train_mode=False,
    ):
        self.store: BaseRolloutStore = None
        self.config = config
        self.reward_fn = reward_fn
        self.metric_fn = metric_fn
        self.train_mode = train_mode
        self.logit_mask = logit_mask
        self.stop_sequences = stop_sequences

    def push_to_store(self, data):
        """Append rollout data to the trainer's rollout store."""
        self.store.push(data)

    def add_eval_pipeline(self, eval_pipeline):
        """Adds pipeline for validation prompts"""
        self.eval_pipeline = eval_pipeline

    @abstractmethod
    def sample(self, prompts: Iterable[str], length: int, n_samples: int) -> Iterable[str]:
        """
        Sample from the language model. Takes prompts and maximum length to generate.

        :param prompts: List of prompts to tokenize and use as context
        :param length: How many new tokens to generate for each prompt
        :type length: int
        :param n_samples: Default behavior is to take number of prompts as this
        """
        pass

    @abstractmethod
    def learn(
        self,
        log_fn: Callable = None,
        save_fn: Callable = None,
        eval_fn: Callable = None,
    ):
        """
        Use experiences in RolloutStore to learn

        :param log_fn: Optional function that is called when logging and passed a dict of logging relevant values
        :type log_fn: Callable[Dict[str, any]]
        :param save_fn: Optional function to call after saving. Is passed the components.
        :type save_fn: Callable[Dict[str, any]]
        :param eval_fn: Optional function to call during evaluation. Eval doesn't do anything without this.
        :type eval_fn: Callable[BaseRLTrainer]
        """
        pass

    @abstractmethod
    def save(self, directory: Optional[str] = None):
        """Creates a checkpoint of training states"""
        pass

    @abstractmethod
    def load(self, directory=None):
        """Loads a checkpoint created from `save`"""
        pass
self.accelerator.is_main_process and not ray.is_initialized(): config_dict = self.config.to_dict() dist_config = get_distributed_config(self.accelerator) config_dict["distributed"] = dist_config init_trackers_kwargs = {} if config.train.tracker == "wandb": init_trackers_kwargs["wandb"] = { "name": run_name, "entity": self.config.train.entity_name, "group": self.config.train.group_name, "tags": ["/".join(get_git_tag())], "mode": "disabled" if os.environ.get("debug", False) else "online", } self.accelerator.init_trackers( project_name=self.config.train.project_name, config=config_dict, init_kwargs=init_trackers_kwargs, ) elif config.train.tracker == "tensorboard": # flatten config for tensorboard, split list in hparams into flatten config config_dict_flat = flatten_dict(config_dict) config_dict_flat["optimizer/kwargs/beta_1"] = config_dict_flat["optimizer/kwargs/betas"][0] config_dict_flat["optimizer/kwargs/beta_2"] = config_dict_flat["optimizer/kwargs/betas"][1] config_dict_flat.pop("optimizer/kwargs/betas", None) self.accelerator.init_trackers( project_name=self.config.train.project_name, config=config_dict_flat, ) elif config.train.tracker is None: self.accelerator.init_trackers(project_name=self.config.train.project_name) else: raise ValueError( f"Only supported trackers are `wandb` and `tensorboard`. Got: `{config.train.tracker}`. " "Set `tracker` to `None` to disable tracking." 
) def setup_model(self): """ Returns a model derived from an instance's TRLConfig """ logger.info(f"Initializing model: {self.config.model.model_path}") # Retrieves model equipped for ppo, ilql, etc model = self.get_arch(self.config) # if self.config.model.model_arch_type == "seq2seq": # freeze_bottom_seq2seq_layers(model.base_model, self.config.model.num_layers_unfrozen) # else: freeze_bottom_causal_layers(model.base_model, self.config.model.num_layers_unfrozen) # Set the delta tuning strategies if self.config.model.delta_kwargs is not None: delta_type, delta_kwargs = parse_delta_kwargs( model.base_model.config, self.config.model.delta_kwargs, self.config.model.num_layers_unfrozen, ) delta_model_class = get_delta_model_class(delta_type) delta_model = delta_model_class(model.base_model, **delta_kwargs) delta_model.freeze_module(exclude=["deltas"], set_state_dict=True) if self.accelerator.is_main_process: delta_model.log() return model def setup_optimizer(self): """ Returns an optimizer derived from an instance's TRLConfig """ optimizer_class = get_optimizer_class(self.config.optimizer.name) optimizer = optimizer_class( self.model.parameters(), **self.config.optimizer.kwargs, ) if "bitsandbytes" in optimizer.__class__.__module__: # Force 32-bit `nn.Embedding` weights for stability. 
def decode(
    self,
    prompts: List[torch.LongTensor],
    samples: List[torch.LongTensor],
    prompt_sizes: torch.LongTensor = None,
) -> Tuple[List[str], List[str], List[str], List[str], List[List[torch.Tensor]]]:
    """
    Decode tensor generations into lists of strings
    (`samples`: List[str], `prompts`: List[str], `outputs`: List[str])

    Returns a 5-tuple: full samples (prompt + output), prompts, outputs,
    prefixes, and the raw output token tensors. `str_prefixes[i]` is None
    unless the model path contains "glm" (non-chatglm) and the prompt holds a
    mask token.

    NOTE(review): the `prompt_sizes` parameter is immediately shadowed by the
    locally recomputed list below, so the caller-supplied value is ignored.
    It is kept in the signature for backward compatibility — confirm with
    callers (e.g. `evaluate`) before removing or honoring it.
    """
    # Assuming prompts were left-padded
    prompt_sizes = []
    prefix_indices = []
    for prompt in prompts:
        prefix_idx = None
        if "chatglm" in self.config.model.model_path:
            # chatglm: use the full prompt length, no sep token lookup
            prompt_sizes.append(len(prompt))
        else:
            logger.debug(f"[decode] prompt: {prompt}")
            if isinstance(prompt, torch.Tensor):
                prompt = prompt.cpu().detach().tolist()
            prompt_sizes.append(prompt.index(self.tokenizer.sep_token_id))
            if "glm" in self.config.model.model_path:
                # Bug fix: list.index raises ValueError (not IndexError) when
                # the value is absent. Catching the correct exception keeps a
                # prompt without a mask token non-fatal instead of crashing.
                try:
                    prefix_idx = prompt.index(self.tokenizer.mask_token_id)
                except ValueError:
                    pass
        prefix_indices.append(prefix_idx)

    str_samples, str_prompts, str_outputs, str_prefixes, sample_outputs = [], [], [], [], []
    for prompt, sample, prompt_size, prefix_idx in zip(prompts, samples, prompt_sizes, prefix_indices):
        output_start_ix = prompt_size
        str_prompt = self.tokenizer.decode(prompt[:prompt_size], skip_special_tokens=True)
        if prefix_idx is not None:
            # Split the generation into a prefix (up to the mask) and the output
            str_prefix = self.tokenizer.decode(sample[output_start_ix:prefix_idx], skip_special_tokens=True)
            sample_output = sample[prefix_idx:]
        else:
            str_prefix = None
            sample_output = sample[output_start_ix:]
        str_output = self.tokenizer.decode(sample_output, skip_special_tokens=True)

        # Trim outputs up to `self.stop_sequences` if any are present
        if self.stop_sequences:
            for stop in self.stop_sequences:
                stop_ix = str_output.find(stop)
                if stop_ix >= 0:
                    str_output = str_output[:stop_ix].rstrip()

        str_prompts.append(str_prompt)
        str_outputs.append(str_output)
        str_prefixes.append(str_prefix)
        sample_outputs.append(sample_output)
        if "chatglm" in self.config.model.model_path:
            sample = str_prompt + str_output
        else:
            sample = str_prompt + self.tokenizer.sep_token + str_output
        str_samples.append(sample)

    return str_samples, str_prompts, str_outputs, str_prefixes, sample_outputs
def evaluate(self):  # noqa: C901
    """Samples model on `eval_prompts`, logs stats with `reward_fn` or `metric_fn` if provided"""
    logger.info("Evaluating model")

    # Do multiple evaluations over a single list in `gen_kwargs` if present
    if self.generate_sweep_kwarg is not None:
        gen_sweep_arg, gen_sweep_values = self.generate_sweep_kwarg
    else:
        gen_sweep_values = [None]

    desc = [
        f"generation sweep 0/{len(gen_sweep_values)}",
        f"eval batch 0/{len(self.eval_dataloader)}",
    ]
    pbar = tqdm(
        total=len(self.eval_dataloader) * len(gen_sweep_values),
        desc=f"[{' | '.join(desc)}]",
        disable=not self.accelerator.is_main_process,
        position=0,
        leave=True,
    )

    stats = {}
    table = []
    for sweep_no, gen_sweep_value in enumerate(gen_sweep_values):
        # A dedicated suffix for wandb logging of swept runs
        sweep_suffix = f"@{gen_sweep_arg}={gen_sweep_value}" if gen_sweep_value is not None else ""

        all_samples, all_prompts, all_prompt_sizes = [], [], []
        generate_time = time()
        for batch_no, prompts in enumerate(self.eval_dataloader):
            logger.debug(f"evaluate() - prompts keys: {prompts.keys()}, input_ids: {prompts['input_ids'].shape}")
            if self.generate_sweep_kwarg:
                samples = self.generate_eval(**prompts, **{gen_sweep_arg: gen_sweep_value})
            else:
                samples = self.generate_eval(**prompts)

            # Every prompt in the batch shares the same (padded) length
            prompt_sizes = torch.tensor(prompts['input_ids'].shape[1]).repeat(len(prompts['input_ids']))
            prompts, samples, prompt_sizes = self.accelerator.gather_for_metrics(
                self.accelerator.pad_across_processes(
                    [prompts['input_ids'], samples, prompt_sizes.to(samples.device)],
                    dim=1,
                    pad_index=self.tokenizer.pad_token_id,
                )
            )
            all_samples.extend(samples.tolist())
            all_prompts.extend(prompts.tolist())
            all_prompt_sizes.extend(prompt_sizes.tolist())

            desc = [
                f"generation sweep {sweep_no + 1}/{len(gen_sweep_values)}",
                f"eval batch {batch_no + 1}/{len(self.eval_dataloader)}",
            ]
            pbar.set_description(f"[{' | '.join(desc)}]")
            pbar.update()
        pbar.close()
        stats["time/generate"] = time() - generate_time

        if self.accelerator.is_main_process:
            str_samples, str_prompts, str_outputs, str_prefixes, _ = self.decode(all_prompts, all_samples, all_prompt_sizes)
            columns = ["prompt", "output"]
            columns_data = [str_prompts, str_outputs]

            # in online setting, compute the reward for validation
            if self.reward_fn:
                logger.info("Computing rewards")
                rewards = torch.tensor(
                    self.reward_fn(
                        samples=str_samples,
                        prompts=str_prompts,
                        outputs=str_outputs,
                    ),
                    dtype=float,
                )
                mean_reward = rewards.mean().item()
                columns.append("reward")
                if not isinstance(rewards, list):
                    rewards = rewards.tolist()
                columns_data.append(rewards)
                stats[f"reward/mean{sweep_suffix}"] = mean_reward

            # additionally log any other metrics
            if self.metric_fn:
                logger.info("Computing metrics")
                metric_time = time()
                metrics = self.metric_fn(
                    samples=str_samples,
                    prompts=str_prompts,
                    outputs=str_outputs,
                )
                stats["time/metric"] = time() - metric_time
                mean_metrics = {
                    f"metrics/{k}{sweep_suffix}": torch.as_tensor(xs).mean(-1) for k, xs in metrics.items()
                }
                stats.update(mean_metrics)
                for metric, values in metrics.items():
                    columns.append(metric)
                    if not isinstance(values, list):
                        values = values.tolist()
                    columns_data.append(values)

            # Prepend the sweep argument along with samples
            if self.generate_sweep_kwarg:
                columns.insert(0, gen_sweep_arg)
                columns_data.insert(0, [gen_sweep_value] * len(samples))
            table.append(list(zip(*columns_data)))

    # Log and display evaluation metrics
    logger.info("Summarizing evaluation")
    if self.accelerator.is_main_process:
        rows = sum(list(map(list, zip(*table))), [])

        # Add metrics/rewards to the table's title
        table_title = f"Evaluation #{self.nth_evaluation}"
        for k, x in stats.items():
            if k.startswith("reward") or k.startswith("metrics"):
                table_title += f" {k}: {significant(x)}"

        rich_table = Table(*columns, title=table_title, show_lines=True)
        for ix in range(max(min(3, len(rows)), len(gen_sweep_values))):
            rich_table.add_row(*[str(significant(x)) for x in rows[ix]])
        Console().print(rich_table)

        if not ray.is_initialized():
            if self.config.train.tracker == "wandb":
                import wandb

                stats["samples"] = wandb.Table(columns, rows)

    self.nth_evaluation += 1
    return stats
logger.info("Computing metrics") metric_time = time() metrics = self.metric_fn( samples=str_samples, prompts=str_prompts, outputs=str_outputs, ) stats["time/metric"] = time() - metric_time mean_metrics = { f"metrics/{k}{sweep_suffix}": torch.as_tensor(xs).mean(-1) for k, xs in metrics.items() } stats.update(mean_metrics) for metric, values in metrics.items(): columns.append(metric) if not isinstance(values, list): values = values.tolist() columns_data.append(values) # Prepend the sweep argument along with samples if self.generate_sweep_kwarg: columns.insert(0, gen_sweep_arg) columns_data.insert(0, [gen_sweep_value] * len(samples)) table.append(list(zip(*columns_data))) # Log and display evaluation metrics logger.info("Summarizing evaluation") if self.accelerator.is_main_process: rows = sum(list(map(list, zip(*table))), []) # Add metrics/rewards to the table's title table_title = f"Evaluation #{self.nth_evaluation}" for k, x in stats.items(): if k.startswith("reward") or k.startswith("metrics"): table_title += f" {k}: {significant(x)}" rich_table = Table(*columns, title=table_title, show_lines=True) for ix in range(max(min(3, len(rows)), len(gen_sweep_values))): rich_table.add_row(*[str(significant(x)) for x in rows[ix]]) Console().print(rich_table) if not ray.is_initialized(): if self.config.train.tracker == "wandb": import wandb stats["samples"] = wandb.Table(columns, rows) self.nth_evaluation += 1 return stats def learn(self): # noqa: C901 """ Samples batches from `self.store`, updates model and periodically evaluates it on `self.eval_dataloader` """ logger.info("Starting training") self.generate_sweep_kwarg = None for k, v in self.config.method.gen_kwargs.items(): if isinstance(v, list): if self.generate_sweep_kwarg is not None: logger.info("Only a single sweep is allowed, {k} is going to be set to {v[0]}") self.generate_kwargs[k] = v[0] else: self.generate_sweep_kwarg = (k, v) self.prepare_learning() self.iter_count = 0 self.nth_evaluation = 0 if 
ray.is_initialized(): checkpoint = session.get_checkpoint() if checkpoint: with checkpoint.as_directory() as dir: self.accelerator.load_state(dir) with open(os.path.join(dir, "state.json")) as f: state = json.load(f) self.iter_count = state["iter_count"] else: results = self.evaluate() self.accelerator.log(results, step=self.iter_count) tbar = tqdm( initial=self.iter_count, total=self.total_steps, disable=not self.accelerator.is_local_main_process, position=0, leave=True, ) best_reward = -float("inf") # For each epoch for _ in range(self.config.train.epochs): # For each batch for batch in self.train_dataloader: # For each update per batch for _ in range(self.n_updates_per_batch): # Note that whereas standard policy gradient methods perform one # gradient update per batch, PPO for example commonly performs # multiple gradient updates on the same batch of data. # https://arxiv.org/pdf/1707.06347.pdf forward_time = time() loss, stats = self.loss(batch) forward_time = time() - forward_time backward_time = time() self.accelerator.backward(loss) backward_time = time() - backward_time self.opt.step() self.opt.zero_grad() self.scheduler.step() self.iter_count += 1 if self.iter_count % self.config.train.checkpoint_interval == 0: subfolder = f"checkpoint_{self.iter_count:0{len(str(self.total_steps))}d}" directory = os.path.join(self.config.train.checkpoint_dir, subfolder) self.save(directory) stats["time/forward"] = forward_time stats["time/backward"] = backward_time for group_number, lr in enumerate(self.scheduler.get_last_lr()): stats[f"learning_rate_group_{group_number}"] = lr if self.iter_count % self.config.train.eval_interval == 0: results = self.evaluate() stats.update(results) # always save checkpoint with the greatest mean reward if self.config.train.save_best: if stats.get("reward/mean", -float("inf")) > best_reward: best_reward = stats.get("reward/mean") do_save = True # in case ILQL reports reward estimate as one of its metrics elif stats.get("metrics/reward", 
-float("inf")) > best_reward: best_reward = stats.get("metrics/reward") do_save = True else: do_save = False do_save = torch.tensor(do_save, device=self.accelerator.device) if torch.distributed.is_initialized(): torch.distributed.all_reduce(do_save, torch.distributed.ReduceOp.MAX) if do_save: best_path = f"{self.config.train.checkpoint_dir}/best_checkpoint" logger.info(f"Saving the best state so far into {best_path}") self.save(best_path) # Report the metrics to Ray Tune. if ray.is_initialized(): self.save("state") with open("state/state.json", "w") as f: json.dump(dict(iter_count=self.iter_count), f) checkpoint = Checkpoint.from_directory("state") session.report(filter_non_scalars(stats), checkpoint=checkpoint) if not ray.is_initialized(): self.accelerator.log(stats, step=self.iter_count) desc = " | ".join(f"{k}: {v:.2f}" for k, v in stats.items() if k.startswith("loss")) tbar.set_description(f"[{desc}]") tbar.update() if self.iter_count >= self.total_steps: subfolder = f"checkpoint_{self.iter_count:0{len(str(self.total_steps))}d}" directory = os.path.join(self.config.train.checkpoint_dir, subfolder) self.save(directory) return self.evaluate() self.post_backward_callback() self.post_epoch_callback() tbar.close() @abstractmethod def get_arch(self, config: TRLConfig): """Returns a specific wrapper of the decoder architecture""" pass @abstractmethod def loss(self, batch) -> Tuple[float, Dict]: """Compute loss on a batch from `store` and return some statistics""" pass @abstractmethod def post_backward_callback(self): """Do something after model update""" pass @abstractmethod def post_epoch_callback(self): """Do something after exhausting/single pass over `self.store`""" pass @register_trainer class AcceleratePPOTrainer(AccelerateRLTrainer): """PPO Accelerate Trainer""" reward_fn: Callable[[List[str], List[str], List[str]], List[float]] tokenizer: AutoTokenizer def __init__(self, config: TRLConfig, **kwargs): """PPO Accelerate Trainer initialization Args: config: 
Config """ super().__init__(config, **kwargs) # Setup rollout logging if config.train.rollout_logging_dir is not None: self.log_rollouts = True self.setup_rollout_logging(config) else: self.log_rollouts = False # Setup the rollout store # Rollouts contain the prompt & response, log probs, values and rewards - from each rollout self.store = PPORolloutStorage(self.tokenizer.pad_token_id) # Create the rollout store dataloader (for batching up rollouts) # TODO (jon-tow): This is only used to satisfy to `accelerator.prepare` call constraint below - remove in future rollout_loader: DataLoader = self.store.create_loader(self.config.train.batch_size, shuffle=True) # Prepare multi-GPU acceleration self.model, self.opt, self.scheduler, rollout_loader = self.accelerator.prepare( self.model, self.opt, self.scheduler, rollout_loader ) self.store.clear_history() # Clear the rollout store # Setup a reference model when hydra heads are not used if not hasattr(self.model, "frozen_head"): self.ref_model = self.get_arch(self.config) self.ref_model.to(self.accelerator.device) self.ref_model.eval() # Setup the KL controller # This helps prevent large divergences in the controller (policy) if config.method.target is not None: self.kl_ctl = AdaptiveKLController(config.method.init_kl_coef, config.method.target, config.method.horizon) else: self.kl_ctl = FixedKLController(config.method.init_kl_coef) # Create the parameters for the Hugging Face language model's generator # method (that generates new tokens from a prompt). 
def get_arch(self, config: TRLConfig):
    """Get the model"""
    model_class = AutoModelForCausalLMWithHydraValueHead
    if config.model.model_arch_type == "seq2seq":
        model_class = AutoModelForSeq2SeqLMWithHydraValueHead

    from_fn = model_class.from_pretrained
    # backward-compat: Try to create a randomly initialized architecture from a config
    if issubclass(type(config.model.model_path), PretrainedConfig):
        from_fn = model_class.from_config
    model = from_fn(
        config.model.model_path,
        trust_remote_code=True,
        num_layers_unfrozen=config.model.num_layers_unfrozen,
        config=config
    )
    return model

def loss(self, batch: PPORLBatch):
    """Forward pass & loss

    Args:
        batch: Previous batch of episodes

    Returns:
        (loss, stats) as computed by `self.config.method.loss`.
    """
    # Move `batch` data to `accelerator` device
    input_ids = batch.query_tensors.to(self.accelerator.device)
    response_tensors = batch.response_tensors.to(self.accelerator.device)
    attention_mask = batch.attention_mask.to(self.accelerator.device)
    old_logprobs = batch.logprobs.to(self.accelerator.device)
    old_values = batch.values.to(self.accelerator.device)
    old_rewards = batch.rewards.to(self.accelerator.device)
    response_length = old_rewards.shape[1]

    logger.debug(f"loss() - input ids shape: {input_ids.shape}, attention mask shape: {attention_mask.shape}")

    advantages, returns = self.config.method.get_advantages_and_returns(old_values, old_rewards, response_length)

    if self.config.model.model_arch_type == "seq2seq":
        # TODO: To be modified for glm and chatglm
        decoder_input_ids = response_tensors
        position_ids = torch.stack(batch.position_ids).to(self.accelerator.device)
        logger.debug(f"loss() - position ids shape: {position_ids.shape}")

        # Forward pass
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids
        )
        logits = outputs.logits
        values_pred = outputs.value
        logprobs = logprobs_of_labels(logits[:, :-1, :], decoder_input_ids[:, 1:])
        mask = decoder_input_ids.ne(self.tokenizer.pad_token_id).long().to(self.accelerator.device)
        start = 0
        end = start + response_length
        logprobs, values_pred, mask = (
            logprobs[:, start:end],
            values_pred[:, start:end],
            mask[:, start:end],
        )
    else:
        outputs = self.model(input_ids, attention_mask, return_dict=True)
        logits = outputs.logits
        values_pred = outputs.value
        # Consistency fix: these per-update shape traces ran at INFO level while
        # every comparable trace in this file uses DEBUG; demoted to avoid
        # spamming the log on every PPO update.
        logger.debug(f"loss() - s1 values_pred shape: {values_pred.shape}")
        values_pred = values_pred[:, :-1]
        logger.debug(f"loss() - s2 values_pred shape: {values_pred.shape}")
        logprobs = logprobs_of_labels(logits[:, :-1, :], input_ids[:, 1:])
        # Only the response portion of the sequence contributes to the loss
        start = input_ids.shape[1] - 1
        end = start + response_length
        logprobs, values_pred, mask = (
            logprobs[:, start:end],
            values_pred[:, start:end],
            attention_mask[:, start:end],
        )
        logger.debug(f"loss() - s3 values_pred shape: {values_pred.shape}")

    # TODO: need debugging here
    loss, stats = self.config.method.loss(
        logprobs=logprobs,
        values=values_pred,
        old_logprobs=old_logprobs,
        old_values=old_values,
        advantages=advantages,
        returns=returns,
        mask=mask,
    )
    return loss, stats

def setup_rollout_logging(self, config):
    # Make rollout logging dir for this run and store config
    exists = os.path.exists(config.train.rollout_logging_dir)
    isdir = os.path.isdir(config.train.rollout_logging_dir)
    assert exists and isdir

    self.run_id = f"run-{uuid.uuid4()}"
    self.rollout_logging_dir = os.path.join(config.train.rollout_logging_dir, self.run_id)
    os.mkdir(self.rollout_logging_dir)

    with open(os.path.join(self.rollout_logging_dir, "config.json"), "w") as f:
        f.write(json.dumps(config.to_dict(), indent=2))
""" if self.log_rollouts: self.store.export_history(location=self.rollout_logging_dir) self.store.clear_history() # Collect more rollouts for training self.make_experience(self.config.method.num_rollouts, self.iter_count) def post_backward_callback(self): self.kl_ctl.update(self.mean_kl.item(), n_steps=self.config.train.batch_size) def prepare_learning(self): eval_dataloader = self.eval_pipeline.create_loader(self.config.method.chunk_size) self.eval_dataloader = self.accelerator.prepare_data_loader(eval_dataloader) self.train_dataloader = self.store.create_loader(self.config.train.batch_size, shuffle=True) self.n_updates_per_batch = self.config.method.ppo_epochs self.total_steps = self.config.train.epochs * self.n_updates_per_batch * len(self.train_dataloader) self.total_steps = min(self.total_steps, self.config.train.total_steps) def add_prompt_pipeline(self, pipeline: BasePipeline): """Add a prompt pipeline dataloader to a trainer instance for the `make_experience` stage""" prompt_dataloader = pipeline.create_loader(self.config.method.chunk_size, shuffle=True) self.prompt_dataloader = self.accelerator.prepare_data_loader(prompt_dataloader) self.prompt_iterator = iter(self.prompt_dataloader) def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noqa: """Make experiences Takes `chunk_size` number of prompts from `prompt_iterator`, samples from the model and then computes the KL against a reference model. Finally it then appends PPOElements to trainer's `store`. Args: num_rollouts: Number of rollouts to generate iter_count: Total number of updates run (i.e. number of updates run for all batches & epochs) """ logger.info("Collecting rollouts") tbar = tqdm( total=num_rollouts, disable=os.environ.get("RANK", 0) != "0", desc=f"[rollout 0 / {num_rollouts}]", # Lower progress bar by 1 if we're in WARNING mode or above to avoid hiding high priority progress # bars (e.g. 
loss progress in trainers) position=logger.level >= logging.WARNING, # Leave progress bar if we're in INFO mode or lower to avoid spamming in suppressed verbosity levels leave=logger.level < logging.WARNING, ) ppo_rl_elements = [] stats = {} clock = Clock() while len(ppo_rl_elements) < num_rollouts: # Get next batch in prompt dataset and refresh if exhausted # TOOD (jon-tow): Make `prompt_dataloader` a cyclic/infinite DataLoader to not require manually # "refreshing" the contents of the `prompt_iterator` try: batch: PromptBatch = next(self.prompt_iterator) except StopIteration: self.prompt_iterator = iter(self.prompt_dataloader) batch = next(self.prompt_iterator) exp_generate_time = time() # Generate samples from the language model (similar to using HuggingFace `generate` method) logger.debug(f"generate() input `batch` keys: {batch.keys()}") samples = self.generate(**batch) for i in range(len(batch['input_ids'])): p = self.tokenizer.decode(batch['input_ids'][i], skip_special_tokens=True) gt = self.tokenizer.decode(samples[i], skip_special_tokens=True) logger.debug(f"prompt: {p}, generated result: {gt}, samples: {samples[i]}") logger.debug(f"make_experience() - input ids shape: {batch['input_ids'].shape}, samples shape: {samples.shape}") stats["time/exp_generate"] = time() - exp_generate_time prompt_tensors = batch['input_ids'] device = samples.device prompt_sizes = torch.tensor([prompt_tensors.shape[1]] * len(prompt_tensors), device=device) padded_samples = self.accelerator.pad_across_processes( samples, dim=1, pad_index=self.tokenizer.pad_token_id, pad_first=False ) padded_prompts = self.accelerator.pad_across_processes( prompt_tensors, dim=1, pad_index=self.tokenizer.pad_token_id, pad_first=False ) gathered_samples = self.accelerator.gather(padded_samples) gathered_prompts = self.accelerator.gather(padded_prompts) gathered_prompt_sizes = self.accelerator.gather(prompt_sizes) if self.accelerator.is_main_process: all_str_samples, all_str_prompts, all_str_outputs, 
all_str_prefixes, _ = self.decode( gathered_prompts, gathered_samples, gathered_prompt_sizes ) exp_score_time = time() all_scores = torch.tensor( self.reward_fn( samples=all_str_samples, prompts=all_str_prompts, outputs=all_str_outputs, ), dtype=torch.float, device=device, ) stats["time/exp_score"] = time() - exp_score_time all_scores = list(all_scores.reshape(self.accelerator.num_processes, -1).unbind()) else: all_scores = None if torch.distributed.is_initialized(): scores = torch.empty(len(samples), device=device) torch.distributed.scatter(scores, all_scores) else: scores = torch.tensor(all_scores[0]) str_samples, str_prompts, str_outputs, str_prefixes, outputs = self.decode(prompt_tensors, samples) # Pad the sample outputs # outputs = self.tokenizer(str_outputs).input_ids # if self.config.model.model_arch_type == "seq2seq": # # add to the start of the output # for i in range(len(outputs)): # outputs[i] = [self.tokenizer.pad_token_id] + outputs[i] # outputs = list(map(torch.LongTensor, outputs)) maxsize = max(map(len, outputs)) outputs = [ F.pad( output, (0, maxsize - len(output)), value=self.tokenizer.pad_token_id, ) for output in outputs ] sample_outputs = torch.vstack(outputs).to(device) # store statistics of the initial rollout as reference if self.ref_mean is None: self.ref_mean, self.ref_std = scores.mean(), scores.std() all_scores_mean, all_scores_std = self.running_moments.update(scores) stats["exp_scores/mean"] = all_scores_mean stats["exp_scores/std"] = all_scores_std stats["exp_scores/running_mean"] = self.running_moments.mean stats["exp_scores/running_std"] = self.running_moments.std if self.config.method.scale_reward == "running": scores /= self.running_moments.std elif self.config.method.scale_reward == "ref": scores /= self.ref_std clip_reward = self.config.method.cliprange_reward if clip_reward: scores = torch.clip(scores, -clip_reward, clip_reward) # Precompute logprobs, values logger.debug(f"sample_outputs shape: {sample_outputs.shape}") 
logger.debug(f"str_prompts[0]: {str_prompts[0]}, str_outputs[0]: {str_outputs[0]}, input_ids[0]: {batch['input_ids'][0]}, sample_outputs[0]: {sample_outputs[0]}") # logger.debug(f"str_prompts[1]: {str_prompts[1]}, str_outputs[1]: {str_outputs[1]}, input_ids[1]: {batch['input_ids'][1]}, sample_outputs[1]: {sample_outputs[1]}") self.tokenizer.padding_side = "right" if self.config.model.model_arch_type == "seq2seq": input_ids, attention_mask, position_ids = [], [], [] for str_prompt, str_output, str_prefix in zip(str_prompts, str_outputs, str_prefixes): encoded_prompt = self.tokenizer(str_prompt, str_prefix + self.tokenizer.mask_token) prompt_length = len(encoded_prompt['input_ids']) label_length = len(self.tokenizer.tokenize(str_output)) + 1 if prompt_length + label_length > self.max_length: num_tokens_to_remove = prompt_length + label_length - self.max_length for _ in range(num_tokens_to_remove): if prompt_length > label_length: prompt_length -= 1 else: label_length -= 1 else: label_length = self.max_length - prompt_length assert prompt_length > 0 assert label_length > 0 assert prompt_length + label_length <= self.max_length encoded_dict = self.tokenizer(str_prompt, str_prefix + self.tokenizer.mask_token, max_length=prompt_length, truncation="only_first", return_tensors="pt", return_attention_mask=True, return_token_type_ids=False) encoded_dict = self.tokenizer.build_inputs_for_generation(encoded_dict, targets=str_output, max_gen_length=label_length, padding=True) input_ids.append(encoded_dict['input_ids']) attention_mask.append(encoded_dict['attention_mask']) position_ids.append(encoded_dict['position_ids']) input_ids = torch.cat(input_ids).to(device) attention_mask = torch.cat(attention_mask).to(device) position_ids = torch.cat(position_ids).to(device) with torch.no_grad(): outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids ) logits = outputs.logits values = outputs.value if hasattr(self.model, "frozen_head"): 
ref_logits = self.model.forward_hydra( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, # decoder_input_ids=sample_outputs, # decoder_attention_mask=decoder_attention_mask, return_dict=True, ).logits else: ref_logits = self.ref_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, # decoder_input_ids=sample_outputs, # decoder_attention_mask=decoder_attention_mask, return_dict=True, ).logits else: # all_tokens = torch.cat((prompt_tensors.to(device), sample_outputs), dim=1) # attention_mask = all_tokens.not_equal(self.tokenizer.pad_token_id).long().to(device) encoded_dict = self.tokenizer(str_prompts, str_outputs, max_length=self.max_length, return_tensors="pt", truncation="longest_first", padding="max_length", return_token_type_ids=False) input_ids = encoded_dict['input_ids'].to(device) attention_mask = encoded_dict['attention_mask'].to(device) position_ids = None with torch.no_grad(): logits, *_, values = self.model( input_ids, attention_mask=attention_mask, ) # TODO(dahoas): When hydra model works need to also support generation on hydra head if hasattr(self.model, "frozen_head"): ref_logits = self.model.forward_hydra( input_ids, attention_mask=attention_mask, return_dict=True, ).logits else: ref_logits = self.ref_model( input_ids, attention_mask=attention_mask, return_dict=True, ).logits ref_logits = ref_logits.to(device) self.tokenizer.padding_side = self.padding_side if self.config.model.model_arch_type == "seq2seq": # TODO: to be tested against glm and chatglm logprobs = logprobs_of_labels(logits[:, :-1, :], sample_outputs[:, 1:]) ref_logprobs = logprobs_of_labels(ref_logits[:, :-1, :], sample_outputs[:, 1:]) else: logprobs = logprobs_of_labels(logits[:, :-1, :], input_ids[:, 1:]) ref_logprobs = logprobs_of_labels(ref_logits[:, :-1, :], input_ids[:, 1:]) n_samples: int = samples.shape[0] logprobs = logprobs.cpu() ref_logprobs = ref_logprobs.cpu() # prompt_tensors = prompt_tensors.cpu() 
sample_outputs = sample_outputs.cpu() input_ids = input_ids.cpu() attention_mask = attention_mask.cpu() position_ids = position_ids.cpu() if position_ids is not None else None values = values.cpu()[:, :-1] # Estimate the KL divergence between the model and reference model if self.config.model.model_arch_type == "seq2seq": # TODO: to be modified for glm and chatglm attention_mask_tmp = sample_outputs != self.tokenizer.pad_token_id start = 0 else: attention_mask_tmp = attention_mask start = prompt_tensors.shape[1] - 1 ends = start + attention_mask_tmp[:, start:].sum(1) # Get the logprobs and values, for tokens that are not padding # or beginning of sequences tokens. These are from the model (not the reference model) all_values = [values[ix, start : ends[ix]] for ix in range(n_samples)] all_logprobs = [logprobs[ix, start : ends[ix]] for ix in range(n_samples)] log_ratio = (logprobs - ref_logprobs) * attention_mask_tmp[:, :-1].cpu() self.mean_kl = (log_ratio.exp() - 1 - log_ratio).mean().to(device) kl_penalty = self.kl_ctl.value * -log_ratio kl_penalty = [xs[start : ends[ix]] for ix, xs in enumerate(kl_penalty)] rollout_count = 0 for sample_idx in range(n_samples): if len(kl_penalty[sample_idx]) == 0 or len(all_logprobs[sample_idx]) == 0: continue rewards = kl_penalty[sample_idx] rewards[-1] += scores[sample_idx].cpu() logger.debug(f"make_experience() - attention mask shape: {attention_mask[sample_idx].shape}") ppo_rl_elements.append( PPORLElement( query_tensor=input_ids[sample_idx], # query_tensor=prompt_tensors[sample_idx], response_tensor=sample_outputs[sample_idx], attention_mask=attention_mask[sample_idx], position_ids=position_ids[sample_idx] if position_ids is not None else None, logprobs=all_logprobs[sample_idx], values=all_values[sample_idx], rewards=rewards, ) ) rollout_count += 1 exp_time = clock.tick() tbar.set_description(f"[rollout {len(ppo_rl_elements)} / {num_rollouts}]") tbar.update(min(rollout_count, num_rollouts)) tbar.close() if 
torch.distributed.is_initialized(): torch.distributed.all_reduce(self.mean_kl, torch.distributed.ReduceOp.AVG) stats["policy/sqrt_kl"] = torch.sqrt(self.mean_kl) stats["kl_ctl_value"] = self.kl_ctl.value stats["time/exp"] = exp_time if not ray.is_initialized(): self.accelerator.log(stats, step=iter_count) # Push samples and rewards to trainer's rollout storage self.push_to_store(ppo_rl_elements) def get_model_norm(model): with torch.no_grad(): total = 0.0 for param in model.parameters(): should_gather = hasattr( param, 'ds_id') and param.ds_status == ZeroParamStatus.NOT_AVAILABLE with deepspeed.zero.GatheredParameters(param, enabled=should_gather): total += float(param.float().norm()) return total def gather_log_probs(logits, labels): log_probs = F.log_softmax(logits, dim=-1) log_probs_labels = log_probs.gather(dim=-1, index=labels.unsqueeze(-1)) return log_probs_labels.squeeze(-1) class DeepSpeedPPOTrainer(): def __init__(self, rlhf_engine, args): self.rlhf_engine = rlhf_engine self.actor_model = self.rlhf_engine.actor self.critic_model = self.rlhf_engine.critic self.ref_model = self.rlhf_engine.ref self.reward_model = self.rlhf_engine.reward self.tokenizer = self.rlhf_engine.tokenizer self.args = args self.max_answer_seq_len = args.max_gen_length # self.end_of_conversation_token_id = self.tokenizer( # args.end_of_conversation_token)['input_ids'][-1] self.end_of_conversation_token_id = self.tokenizer.eos_token_id # Those value can be changed self.kl_ctl = args.kl_coefficient self.clip_reward_value = args.clip_reward_value self.cliprange = args.clip_range self.cliprange_value = args.clip_range_value self.gamma = args.gamma self.lam = args.lambda_ def generate_sequence(self, inputs): self.eval() print_gpu_utilization("generate_sequence - before model.generate", self.args.local_rank) print_gpu_utilization_torch("generate_sequence - before model.generate", self.args.local_rank) batch_size = inputs['input_ids'].shape[0] prompt_length = inputs['input_ids'].shape[-1] 
        with torch.no_grad():
            logger.debug(f"[generate_sequence] inputs: {inputs}")
            prompts = []
            answers = []
            outputs = dict()
            # Generate one prompt at a time; model family (pangu / chatglm / glm) is
            # inferred from the actor model path and decides how outputs are re-encoded.
            for i in range(batch_size):
                input = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
                prompt = self.tokenizer.decode(input['input_ids'][0], skip_special_tokens=False)
                if "pangu" in self.args.actor_model_path:
                    seq = self.actor_model.module.generate(**input,
                                                           max_new_tokens=self.max_answer_seq_len,
                                                           pad_token_id=self.tokenizer.pad_token_id,
                                                           do_sample=self.args.do_sample,
                                                           num_return_sequences=self.args.num_return_sequences,
                                                           top_p=self.args.top_p,
                                                           temperature=self.args.temperature)
                    for output_ids in seq:
                        answer = self.tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True)
                        # Since prompt has special tokens, cannot use tokenizer(prompts, answers).
                        # Therefore concat prompt and answer, use tokenizer(prompt+answer) instead
                        prompts.append(prompt + answer)
                elif "chatglm" in self.args.actor_model_path:
                    seq = self.actor_model.module.generate(**input,
                                                           max_new_tokens=self.max_answer_seq_len,
                                                           eos_token_id=self.tokenizer.eop_token_id,
                                                           pad_token_id=self.tokenizer.pad_token_id,
                                                           do_sample=self.args.do_sample,
                                                           num_return_sequences=self.args.num_return_sequences,
                                                           top_p=self.args.top_p,
                                                           temperature=self.args.temperature)
                    logger.debug(f"[generate_sequence] seq: {seq}")
                    for output_ids in seq:
                        answer = self.tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True)
                        prompts.append(prompt)
                        answers.append(answer)
                elif "glm" in self.args.actor_model_path:
                    seq = self.actor_model.module.generate(**input,
                                                           max_new_tokens=self.max_answer_seq_len,
                                                           eos_token_id=self.tokenizer.eop_token_id,
                                                           pad_token_id=self.tokenizer.pad_token_id,
                                                           do_sample=self.args.do_sample,
                                                           num_return_sequences=self.args.num_return_sequences,
                                                           top_p=self.args.top_p,
                                                           temperature=self.args.temperature)
                    for output_ids in seq:
                        answer = self.tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True)
                        # Budget prompt vs. label tokens so they exactly fill max_length,
                        # trimming the longer side first when over budget.
                        label_length = len(self.tokenizer.tokenize(answer)) + 1
                        if prompt_length + label_length > self.args.max_length:
                            num_tokens_to_remove = prompt_length + label_length - self.args.max_length
                            for _ in range(num_tokens_to_remove):
                                if prompt_length > label_length:
                                    prompt_length -= 1
                                else:
                                    label_length -= 1
                        else:
                            label_length = self.args.max_length - prompt_length
                        assert prompt_length > 0
                        assert label_length > 0
                        assert prompt_length + label_length == self.args.max_length
                        encoded_dict = self.tokenizer(prompt, max_length=prompt_length, return_tensors="pt",
                                                      return_attention_mask=True, return_token_type_ids=False,
                                                      add_special_tokens=False)
                        encoded_dict = self.tokenizer.build_inputs_for_generation(encoded_dict, targets=answer,
                                                                                  max_gen_length=label_length,
                                                                                  padding=True)
                        for key, val in encoded_dict.items():
                            if key not in outputs:
                                outputs[key] = []
                            outputs[key].append(val[0])
                else:
                    raise ValueError(f"Unsupported model name: {self.args.actor_model_path}")

            # Re-encode the whole batch in the family-specific layout.
            if "pangu" in self.args.actor_model_path:
                outputs = self.tokenizer(prompts, max_length=self.args.max_length, padding="max_length",
                                         return_tensors="pt", return_token_type_ids=False)
                logger.debug(f"[generate_sequence] outputs['input_ids'].shape: {outputs['input_ids'].shape}, outputs: {outputs}")
            elif "chatglm" in self.args.actor_model_path:
                outputs = self.tokenizer(prompts, answers, max_length=self.args.max_length, padding="max_length",
                                         return_tensors="pt")
                logger.debug(f"[generate_sequence] outputs['input_ids'].shape: {outputs['input_ids'].shape}, outputs: {outputs}")
            elif "glm" in self.args.actor_model_path:
                outputs = {key: torch.stack(val) for key, val in outputs.items()}
                logger.debug(f"[generate_sequence] outputs['input_ids'].shape: {outputs['input_ids'].shape}, outputs: {outputs}")
            else:
                raise ValueError(f"Unsupported model name: {self.args.actor_model_path}")

        print_gpu_utilization("generate_sequence - after model.generate", self.args.local_rank)
        print_gpu_utilization_torch("generate_sequence - after model.generate", self.args.local_rank)

        # Filter out seq with no answers (or very short). This happens when users directly use
        # the pre-training ckpt without supervised finetuning
        # NOTE: this will cause each GPU to have a different number of examples
        # ans = seq[:, prompt_length:]
        # self.prompt_length = prompt_length
        # valid_ans_len = (ans != self.tokenizer.pad_token_id).sum(dim=-1)
        # out_seq = []
        # for i in range(batch_size):
        #     # if the answer is shorter than 1 token, drop it
        #     if valid_ans_len[i] <= 1:
        #         continue
        #     else:
        #         out_seq.append(seq[i:i + 1])
        # out_seq = torch.cat(out_seq, dim=0)  # concat output in the batch dim
        # logger.debug(f"[generate_sequence] out_seq: {out_seq}")

        return outputs, prompt_length

    def generate_experience(self, output_sequences, answer_start_indices):
        """Run actor/ref/reward(/critic) forward passes over generated sequences and
        package logprobs, values and rewards for `train_rlhf`.

        Args:
            output_sequences: batch dict from `generate_sequence` (input_ids, optional
                attention_mask / position_ids).
            answer_start_indices: per-sample index where the answer tokens begin.
        """
        self.eval()
        print_gpu_utilization("generate_experience - before call actor and critic", self.args.local_rank)
        print_gpu_utilization_torch("generate_experience - before call actor and critic", self.args.local_rank)
        # pad_token_id = self.tokenizer.pad_token_id
        input_ids = output_sequences['input_ids']
        attention_mask = output_sequences['attention_mask'] if "attention_mask" in output_sequences else None
        position_ids = output_sequences['position_ids'] if "position_ids" in output_sequences else None
        print_gpu_utilization("generate_experience - after setting output_sequences device", self.args.local_rank)
        print_gpu_utilization_torch("generate_experience - after setting output_sequences device", self.args.local_rank)
        with torch.no_grad():
            output = self.actor_model(input_ids, attention_mask=attention_mask, position_ids=position_ids)
            output_ref = self.ref_model(input_ids, attention_mask=attention_mask, position_ids=position_ids)
            output_reward = self.reward_model(input_ids, attention_mask, position_ids)
            reward_score = output_reward['chosen_reward'].detach()
            # Without a separate critic, the reward model's value head supplies values.
            if self.critic_model is not None:
                values = self.critic_model(input_ids, attention_mask, position_ids)['chosen_values'].detach()
            else:
                values = output_reward['chosen_values'].detach()
        print_gpu_utilization("generate_experience - after call actor and critic", self.args.local_rank)
        print_gpu_utilization_torch("generate_experience - after call actor and critic", self.args.local_rank)
        logits = output.logits
        logits_ref = output_ref.logits

        return {
            # 'prompts': inputs['input_ids'],
            'answer_start_indices': answer_start_indices,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'position_ids': position_ids,
            'logprobs': gather_log_probs(logits[:, :-1, :], input_ids[:, 1:]),
            'ref_logprobs': gather_log_probs(logits_ref[:, :-1, :], input_ids[:, 1:]),
            'value': values[:, :-1],
            'rewards': reward_score
        }

    def compute_rewards(self, starts, log_probs, ref_log_probs, reward_score, action_mask):
        '''
        Per-token reward = KL penalty against the reference policy, with the clipped
        scalar reward added on the last valid answer token of each sample.

        :param starts: List of indices of the starting index of answer
        :param log_probs: shape=batch_size * (max_length-1)
        :param ref_log_probs: shape=batch_size * (max_length-1)
        :param reward_score: shape=batch_size
        :param action_mask: shape=batch_size * (answer_length)
        :return: rewards tensor, shape=batch_size * (max_length-1)
        '''
        logger.debug(f"[compute_rewards] log_probs: {log_probs.shape}, ref_log_probs: {ref_log_probs.shape}, "
                     f"reward_score: {reward_score.shape}, action_mask: {action_mask.shape}")
        kl_divergence_estimate = -self.kl_ctl * (log_probs - ref_log_probs)
        rewards = kl_divergence_estimate
        logger.debug(f"before rewards: {rewards.shape}")
        # start = prompts.shape[1] - 1
        # ends = start + action_mask.sum(1)
        sums = action_mask.sum(1)
        reward_clip = torch.clamp(reward_score, -self.clip_reward_value,
                                  self.clip_reward_value)
        batch_size = log_probs.shape[0]
        for j in range(batch_size):
            logger.debug(f"j={j}, sums[j]={sums[j]}, rewards[j, start:ends[j]]: {rewards[j, starts[j]:(starts[j]+sums[j])].shape}")
            rewards[j, starts[j]:(starts[j]+sums[j])][-1] += reward_clip[j]
        logger.debug(f"after rewards: {rewards.shape}")

        return rewards

    def train_rlhf(self, inputs):
        """One PPO update step on a batch of experiences from `generate_experience`.

        Returns (actor_loss, critic_loss); critic_loss is None without a critic model.
        """
        # process the old outputs
        answer_start_indices = inputs['answer_start_indices']
        log_probs = inputs['logprobs']  # shape=batch_size * (max_length-1)
        ref_log_probs = inputs['ref_logprobs']  # shape=batch_size * (max_length-1)
        reward_score = inputs['rewards']  # shape=batch_size
        values = inputs['value']  # shape=batch_size * (max_length-1)
        attention_mask = inputs['attention_mask']  # shape=batch_size * max_length or shape=batch_size * max_length * max_length
        position_ids = inputs['position_ids']  # shape=batch_size * 2 * max_length
        input_ids = inputs['input_ids']  # shape=batch_size * max_length
        logger.debug(f"[train_rlhf] answer_start_indices: {answer_start_indices}, "
                     f"log_probs shape: {log_probs.shape}, ref_log_probs shape: {ref_log_probs.shape}, "
                     f"reward_score shape: {reward_score.shape}, values shape: {values.shape}, "
                     f"attention_mask shape: {attention_mask.shape if attention_mask is not None else None},"
                     f"position_ids shape: {position_ids.shape if position_ids is not None else None},"
                     f"input_ids shape: {input_ids.shape}")
        batch_size = input_ids.size()[0]
        # Build the action mask (1 = answer token to optimize, 0 = prompt/padding).
        if attention_mask is not None and len(attention_mask.shape) == 2:
            # action_mask = attention_mask[:, 1:][:, start:]
            action_mask = attention_mask[:, 1:]
        else:
            # 2D (per-token) attention masks (e.g. GLM) can't be reused directly;
            # reconstruct the mask from pad tokens instead.
            # answer_ids = input_ids[:, 1:][:, start:]
            # batch_size = answer_ids.shape[0]
            # answer_length = answer_ids.shape[-1]
            answer_length = input_ids.shape[-1] - 1
            action_mask = torch.ones((batch_size, answer_length), dtype=torch.long, device=input_ids.device)
            for i, j in (input_ids[:, 1:] == self.tokenizer.pad_token_id).nonzero():
                action_mask[i, j] = 0
        for i in range(batch_size):
            # set mask of prompt to 0
            action_mask[i, :answer_start_indices[i]] = 0
        logger.debug(f"[train_rlhf] action_mask shape: {action_mask.shape}")

        # compute advantages and returns
        print_gpu_utilization("train_rlhf - before compute reward and advantages", self.args.local_rank)
        print_gpu_utilization_torch("train_rlhf - before compute reward and advantages", self.args.local_rank)
        old_values = values
        with torch.no_grad():
            old_rewards = self.compute_rewards(answer_start_indices, log_probs, ref_log_probs,
                                               reward_score, action_mask)
            advantages, returns = self.get_advantages_and_returns(old_values, old_rewards, answer_start_indices)
        logger.debug(f"[train_rlhf] old_rewards shape: {old_rewards.shape}, advantages shape: {advantages.shape}, returns shape: {returns.shape}")
        print_gpu_utilization("train_rlhf - after compute reward and advantages", self.args.local_rank)
        print_gpu_utilization_torch("train_rlhf - after compute reward and advantages", self.args.local_rank)

        # update actor and critic
        self.train()
        batch = {'input_ids': input_ids, "attention_mask": attention_mask, "position_ids": position_ids}
        actor_prob = self.actor_model(**batch, use_cache=False).logits  # shape=batch_size * max_length * vocab_size
        print_gpu_utilization("train_rlhf - after self.actor_model", self.args.local_rank)
        print_gpu_utilization_torch("train_rlhf - after self.actor_model", self.args.local_rank)
        actor_log_prob = gather_log_probs(actor_prob[:, :-1, :], input_ids[:, 1:])
        actor_loss = self.actor_loss_fn(actor_log_prob, log_probs, advantages, action_mask)
        self.actor_model.backward(actor_loss)
        print_gpu_utilization("train_rlhf - after actor backward", self.args.local_rank)
        print_gpu_utilization_torch("train_rlhf - after actor backward", self.args.local_rank)
        self.actor_model.step()
        print_gpu_utilization("train_rlhf - after actor step", self.args.local_rank)
        print_gpu_utilization_torch("train_rlhf - after actor step", self.args.local_rank)
        if self.critic_model is not None:
            value = self.critic_model.reward(**batch, use_cache=False)[0][:, :-1]  # shape=batch_size * (max_length-1)
            print_gpu_utilization("train_rlhf - after self.critic_model", self.args.local_rank)
            print_gpu_utilization_torch("train_rlhf - after self.critic_model", self.args.local_rank)
            critic_loss = self.critic_loss_fn(value, old_values, returns, action_mask)
            self.critic_model.backward(critic_loss)
            print_gpu_utilization("train_rlhf - after critic backward", self.args.local_rank)
            print_gpu_utilization_torch("train_rlhf - after critic backward", self.args.local_rank)
            self.critic_model.step()
            print_gpu_utilization("train_rlhf - after critic step", self.args.local_rank)
            print_gpu_utilization_torch("train_rlhf - after critic step", self.args.local_rank)
        else:
            critic_loss = None

        return actor_loss, critic_loss

    def actor_loss_fn(self, logprobs, old_logprobs, advantages, mask):
        ## Clipped Surrogate Objective for policy update in PPO (https://arxiv.org/abs/1707.06347)
        log_ratio = (logprobs - old_logprobs) * mask
        ratio = torch.exp(log_ratio)
        pg_objective1 = advantages * ratio
        pg_objective2 = advantages * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
        # Masked mean of the pessimistic (min) objective; negated to be minimized.
        pg_objective = torch.sum(torch.min(pg_objective1, pg_objective2) * mask) / mask.sum()
        return -pg_objective

    def critic_loss_fn(self, values, old_values, returns, mask):
        # TODO: Clipped surrogate objective for value function (? not seen in original paper)
        values_clipped = torch.clamp(
            values,
            old_values - self.cliprange_value,
            old_values + self.cliprange_value,
        )
        # Squared-error loss of value function (https://arxiv.org/abs/1707.06347)
        vf_loss1 = (values - returns)**2
        vf_loss2 = (values_clipped - returns)**2
        # TODO: using max puts a lower bound and no upper bound on the loss, is this really desired?
vf_loss = 0.5 * torch.sum( torch.max(vf_loss1, vf_loss2) * mask) / mask.sum() return vf_loss def get_advantages_and_returns(self, values, rewards, starts): ''' :param values: shape=batch_size * (max_length-1) :param rewards: shape=batch_size * (max_length-1) :param start: List of indices of the starting index of answer :return: ''' # Generalized advantage estimation (https://arxiv.org/abs/1707.06347) logger.debug(f"[get_advantages_and_returns] values: {values.shape}, rewards: {rewards.shape}, starts: {starts}") batch_size = rewards.size()[0] length = rewards.size()[-1] # lastgaelam = 0 # advantages_reversed = [] # for t in reversed(range(start, length)): # nextvalues = values[:, t + 1] if t < length - 1 else 0.0 # delta = rewards[:, t] + self.gamma * nextvalues - values[:, t] # lastgaelam = delta + self.gamma * self.lam * lastgaelam # advantages_reversed.append(lastgaelam) # advantages = torch.stack(advantages_reversed[::-1], dim=1) # logger.debug(f"advantages: {advantages.shape}, values[:, start:]: {values[:, start:].shape}") # returns = advantages + values[:, start:] advantages = [] returns = [] for i in range(batch_size): lastgaelam = 0 advantages_reversed = [] for t in reversed(range(starts[i], length)): nextvalues = values[i, t + 1] if t < length - 1 else 0.0 delta = rewards[i, t] + self.gamma * nextvalues - values[i, t] lastgaelam = delta + self.gamma * self.lam * lastgaelam advantages_reversed.append(lastgaelam) # set advantage of prompt to 0 (will be ignored when multiplied with action_mask) advantages_reversed.extend([0]*starts[i]) advantage = torch.tensor(advantages_reversed[::-1], device=values.device, dtype=values.dtype) advantages.append(advantage) returns.append(advantage + values[i]) advantages = torch.stack(advantages) returns = torch.stack(returns) return advantages.detach(), returns def _validate_training_mode(self): assert self.actor_model.module.training if self.critic_model is not None: assert self.critic_model.module.training def 
_validate_evaluation_mode(self): assert not self.actor_model.module.training assert not self.ref_model.module.training if self.critic_model is not None: assert not self.critic_model.module.training assert not self.reward_model.module.training def train(self): self.actor_model.train() if self.critic_model is not None: self.critic_model.train() def eval(self): self.actor_model.eval() self.ref_model.eval() if self.critic_model is not None: self.critic_model.eval() self.reward_model.eval() def dump_model_norms(self, tag): actor_model_norm = get_model_norm(self.actor_model) ref_model_norm = get_model_norm(self.ref_model) if self.critic_model is not None: critic_model_norm = get_model_norm(self.critic_model) reward_model_norm = get_model_norm(self.reward_model) if self.args.global_rank <= 0: logger.info(f'{tag} global_actor_model_norm', actor_model_norm, self.args.local_rank) logger.info(f'{tag} global_ref_model_norm', ref_model_norm, self.args.local_rank) if self.critic_model is not None: logger.info(f'{tag} global_critic_model_norm', critic_model_norm, self.args.local_rank) logger.info(f'{tag} global_reward_model_norm', reward_model_norm, self.args.local_rank) class DeepSpeedPPOPTXTrainer(DeepSpeedPPOTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def train_unsupervised(self, inputs, unsup_coef): # Train the unsupervised model here self._validate_training_mode() outputs = self.actor_model(**inputs, use_cache=False) loss = outputs.loss self.actor_model.backward(unsup_coef * loss) self.actor_model.step() return loss class DPOTrainer(Trainer): r""" Initialize DPOTrainer. Args: model (`transformers.PreTrainedModel`): The model to train, preferably an `AutoModelForSequenceClassification`. ref_model (`PreTrainedModelWrapper`): Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation and loss. 
            If no reference model is provided, the trainer will create a reference model with the same architecture as the model to be optimized.
        beta (`float`, defaults to 0.1):
            The beta factor in DPO loss. Higher beta means less divergence from the initial policy.
        args (`transformers.TrainingArguments`):
            The arguments to use for training.
        data_collator (`transformers.DataCollator`):
            The data collator to use for training. If None is specified, the default data collator (`DPODataCollatorWithPadding`) will be used
            which will pad the sequences to the maximum length of the sequences in the batch, given a dataset of paired sequences.
        label_pad_token_id (`int`, defaults to `-100`):
            The label pad token id. This argument is required if you want to use the default data collator.
        padding_value (`int`, defaults to `0`):
            The padding value. This argument is required if you want to use the default data collator.
        truncation_mode (`str`, defaults to `keep_end`):
            The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator.
        train_dataset (`datasets.Dataset`):
            The dataset to use for training.
        eval_dataset (`datasets.Dataset`):
            The dataset to use for evaluation.
        tokenizer (`transformers.PreTrainedTokenizerBase`):
            The tokenizer to use for training. This argument is required if you want to use the default data collator.
        model_init (`Callable[[], transformers.PreTrainedModel]`):
            The model initializer to use for training. If None is specified, the default model initializer will be used.
        callbacks (`List[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
        max_length (`int`, defaults to `None`):
            The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.
        max_prompt_length (`int`, defaults to `None`):
            The maximum length of the prompt. This argument is required if you want to use the default data collator.
        peft_config (`Dict`, defaults to `None`):
            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
        disable_dropout (`bool`, defaults to `True`):
            Whether or not to disable dropouts in `model` and `ref_model`.
    """

    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        ref_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
        logps: Optional[Dict[str, Dict[int, Dict[str, torch.FloatTensor]]]] = None,
        beta: float = 0.1,
        args: TrainingArguments = None,
        data_collator: Optional[DataCollator] = None,
        label_pad_token_id: int = -100,
        padding_value: int = 0,
        # truncation_mode: str = "keep_end",
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
            None,
            None,
        ),
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
        # max_length: Optional[int] = None,
        # max_prompt_length: Optional[int] = None,
        # peft_config: Optional[Dict] = None,
        disable_dropout: bool = True,
    ):
        self.is_peft_model = getattr(model, "is_peft_model", False)
        # With precomputed reference log-probabilities (`logps`) no reference model
        # is needed at all; otherwise pick/create one.
        if logps is not None:
            self.logps = logps
            self.ref_model = None
        else:
            self.logps = None
            if ref_model:
                self.ref_model = ref_model
            elif self.is_peft_model:
                # The `model` with adapters turned off will be used as the reference model
                self.ref_model = None
            else:
                self.ref_model = create_reference_model(model)

        # if data_collator is None:
        #     if tokenizer is None:
        #         raise ValueError(
        #             "max_length or a tokenizer must be specified when using the default DPODataCollatorWithPadding"
        #         )
        #     if max_length is None:
        #         logger.warn(
        #             "When using DPODataCollatorWithPadding, you should set `max_length` in the DPOTrainer's init"
        #             " it will be set to `512` by default, but you should do it yourself in the future.",
        #             UserWarning,
        #         )
        #         max_length = 512
        #     if max_prompt_length is None:
        #         logger.warn(
        #             "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the DPOTrainer's init"
        #             " it will be set to `128` by default, but you should do it yourself in the future.",
        #             UserWarning,
        #         )
        #         max_prompt_length = 128
        #
        #     data_collator = DPODataCollatorWithPadding(
        #         tokenizer,
        #         max_length=max_length,
        #         max_prompt_length=max_prompt_length,
        #         label_pad_token_id=label_pad_token_id,
        #         padding_value=padding_value,
        #         truncation_mode=truncation_mode,
        #     )
        #
        #     if args.remove_unused_columns:
        #         args.remove_unused_columns = False
        #         # warn users
        #         warnings.warn(
        #             "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments"
        #             " we have set it for you, but you should do it yourself in the future.",
        #             UserWarning,
        #         )
        #
        #     self.use_dpo_data_collator = True
        # else:
        #     self.use_dpo_data_collator = False

        if disable_dropout:
            disable_dropout_in_model(model)
            if self.ref_model is not None:
                disable_dropout_in_model(self.ref_model)

        self.label_pad_token_id = label_pad_token_id
        self.padding_value = padding_value
        self.beta = beta
        # Per-split metric accumulators, flushed when logging.
        self._stored_metrics = defaultdict(lambda: defaultdict(list))

        super().__init__(
            model,
            args,
            data_collator,
            train_dataset,
            eval_dataset,
            tokenizer,
            model_init,
            None,
            callbacks,
            optimizers,
            preprocess_logits_for_metrics,
        )

        if not hasattr(self, "accelerator"):
            raise AttributeError(
                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
            )

        if self.ref_model is None:
            if self.is_peft_model and not hasattr(
                self.accelerator.unwrap_model(self.model).pretrained_model,
                "disable_adapter",
            ):
                raise ValueError(
                    "You are using a `peft` version that does not support `disable_adapter`. Please update your `peft` version to the latest version."
                )
        else:
            self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)

    def concatenated_inputs(self, batch: Dict[str, Union[List, torch.LongTensor]]) -> Dict[str, torch.LongTensor]:
        """Concatenate the chosen and rejected inputs into a single tensor.

        Args:
            batch: A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids',
                which are tensors of shape (batch_size, sequence_length).

        Returns:
            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
        """
        # Pad both halves to the longer of the two sequence lengths, then stack
        # chosen rows first and rejected rows after along the batch dimension.
        max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])
        concatenated_batch = {}
        for k in batch:
            if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
                # Labels are padded with the label pad id, everything else with padding_value.
                pad_value = self.label_pad_token_id if "labels" in k else self.padding_value
                concatenated_key = k.replace("chosen", "concatenated")
                concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value)
        for k in batch:
            if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
                pad_value = self.label_pad_token_id if "labels" in k else self.padding_value
                concatenated_key = k.replace("rejected", "concatenated")
                concatenated_batch[concatenated_key] = torch.cat(
                    (
                        concatenated_batch[concatenated_key],
                        pad_to_length(batch[k], max_length, pad_value=pad_value),
                    ),
                    dim=0,
                ).to(self.accelerator.device)
        return concatenated_batch

    def dpo_loss(
        self,
        policy_chosen_logps: torch.FloatTensor,
        policy_rejected_logps: torch.FloatTensor,
        reference_chosen_logps: torch.FloatTensor,
        reference_rejected_logps: torch.FloatTensor,
        reference_free: bool = False,
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        """Compute the DPO loss for a batch of policy and reference model log probabilities.

        Args:
            policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
            policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
            reference_chosen_logps: Log probabilities of the reference model for the chosen responses. Shape: (batch_size,)
            reference_rejected_logps: Log probabilities of the reference model for the rejected responses. Shape: (batch_size,)
            beta: Temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. We ignore the reference model as beta -> 0.
            reference_free: If True, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal probability to all responses.

        Returns:
            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards).
            The losses tensor contains the DPO loss for each example in the batch.
            The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected responses, respectively.
        """
        pi_logratios = policy_chosen_logps - policy_rejected_logps
        ref_logratios = reference_chosen_logps - reference_rejected_logps

        if reference_free:
            ref_logratios = 0

        logits = pi_logratios - ref_logratios

        # -log sigmoid(beta * (policy margin - reference margin)): the DPO objective.
        losses = -F.logsigmoid(self.beta * logits)
        chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
        rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()

        return losses, chosen_rewards, rejected_rewards

    def _get_batch_logps(
        self,
        logits: torch.FloatTensor,
        labels: torch.LongTensor,
        average_log_prob: bool = False,
    ) -> torch.FloatTensor:
        """Compute the log probabilities of the given labels under the given logits.

        Args:
            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
            labels: Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are ignored.
Shape: (batch_size, sequence_length) average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens. Returns: A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits. """ if logits.shape[:-1] != labels.shape: raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") labels = labels[:, 1:].clone() logits = logits[:, :-1, :] loss_mask = labels != self.label_pad_token_id # dummy token; we'll ignore the losses on these tokens later labels[labels == self.label_pad_token_id] = 0 per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) if average_log_prob: return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) else: return (per_token_logps * loss_mask).sum(-1) def concatenated_forward( self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]] ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. We do this to avoid doing two forward passes, because it's faster for FSDP. 
""" concatenated_batch = self.concatenated_inputs(batch) all_logits = model( concatenated_batch["concatenated_input_ids"], attention_mask=concatenated_batch["concatenated_attention_mask"], ).logits.to(torch.float32) all_logps = self._get_batch_logps( all_logits, concatenated_batch["concatenated_labels"], average_log_prob=False, ) chosen_logps = all_logps[: batch["chosen_input_ids"].shape[0]] rejected_logps = all_logps[batch["chosen_input_ids"].shape[0] :] chosen_logits = all_logits[: batch["chosen_input_ids"].shape[0]] rejected_logits = all_logits[batch["chosen_input_ids"].shape[0] :] return chosen_logps, rejected_logps, chosen_logits, rejected_logits def separate_forward( self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]] ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Run the given model on the given batch of inputs, separately process chosen and rejected inputs. """ chosen_logits = model( batch["chosen_input_ids"], attention_mask=batch.get("chosen_attention_mask", None), ).logits.to(torch.float32) chosen_logps = self._get_batch_logps( chosen_logits, batch["chosen_labels"], average_log_prob=False, ) rejected_logits = model( batch["rejected_input_ids"], attention_mask=batch.get("rejected_attention_mask", None), ).logits.to(torch.float32) rejected_logps = self._get_batch_logps( rejected_logits, batch["rejected_labels"], average_log_prob=False, ) return chosen_logps, rejected_logps, chosen_logits, rejected_logits def get_batch_metrics( self, model, batch: Dict[str, Union[List, torch.LongTensor]], train_eval: Literal["train", "eval"] = "train", ): """Compute the DPO loss and other metrics for the given batch of inputs for train or test.""" metrics = {} ( policy_chosen_logps, policy_rejected_logps, policy_chosen_logits, policy_rejected_logits, ) = self.separate_forward(model, batch) # ) = self.concatenated_forward(model, batch) if self.logps is not None: reference_chosen_logps = [] 
    def get_batch_metrics(
        self,
        model,
        batch: Dict[str, Union[List, torch.LongTensor]],
        train_eval: Literal["train", "eval"] = "train",
    ):
        """Compute the DPO loss and other metrics for the given batch of inputs for train or test."""
        metrics = {}
        (
            policy_chosen_logps,
            policy_rejected_logps,
            policy_chosen_logits,
            policy_rejected_logits,
        ) = self.separate_forward(model, batch)
        # ) = self.concatenated_forward(model, batch)
        if self.logps is not None:
            # Reference log-probs were precomputed: look them up by sample index
            # instead of running a reference forward pass.
            reference_chosen_logps = []
            reference_rejected_logps = []
            for idx in batch['index'].detach().cpu().tolist():
                reference_chosen_logps.append(self.logps[train_eval][idx]['chosen_logp'])
                reference_rejected_logps.append(self.logps[train_eval][idx]['rejected_logp'])
            reference_chosen_logps = torch.stack(reference_chosen_logps).to(self.accelerator.device)
            reference_rejected_logps = torch.stack(reference_rejected_logps).to(self.accelerator.device)
        else:
            # Reference log-probs are computed on the fly, without gradients.
            with torch.no_grad():
                if self.ref_model is None:
                    # PEFT case: the policy with adapters disabled *is* the reference.
                    assert self.is_peft_model, "When ref_model is None, the policy must be a peft model"
                    with self.accelerator.unwrap_model(self.model).pretrained_model.disable_adapter():
                        (
                            reference_chosen_logps,
                            reference_rejected_logps,
                            _,
                            _,
                        ) = self.separate_forward(self.model, batch)
                        # ) = self.concatenated_forward(self.model, batch)
                else:
                    (
                        reference_chosen_logps,
                        reference_rejected_logps,
                        _,
                        _,
                    ) = self.separate_forward(self.ref_model, batch)
                    # ) = self.concatenated_forward(self.ref_model, batch)
        losses, chosen_rewards, rejected_rewards = self.dpo_loss(
            policy_chosen_logps,
            policy_rejected_logps,
            reference_chosen_logps,
            reference_rejected_logps,
        )
        # Fraction of pairs where the implied chosen reward beats the rejected one.
        reward_accuracies = (chosen_rewards > rejected_rewards).float()

        prefix = "eval_" if train_eval == "eval" else ""
        # Metrics are reduced to plain numpy scalars for logging.
        metrics[f"{prefix}rewards/chosen"] = chosen_rewards.cpu().numpy().mean()
        metrics[f"{prefix}rewards/rejected"] = rejected_rewards.cpu().numpy().mean()
        metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.cpu().numpy().mean()
        metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).cpu().numpy().mean()
        metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.detach().cpu().numpy().mean()
        metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.detach().cpu().numpy().mean()
        metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.detach().cpu().numpy().mean()
        metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.detach().cpu().numpy().mean()

        return losses.mean(), metrics
    def compute_loss(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs=False,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
        """Trainer hook: compute the batch-mean DPO loss (and stash metrics for `log`)."""
        loss, metrics = self.get_batch_metrics(model, inputs, train_eval="train")

        # force log the metrics
        if self.accelerator.is_main_process:
            self.store_metrics(metrics, train_eval="train")

        if return_outputs:
            return (loss, metrics)
        return loss

    def get_batch_samples(self, model, batch: Dict[str, torch.LongTensor]) -> Tuple[str, str]:
        """Generate samples from the model and reference model for the given batch of inputs.

        NOTE(review): this reads `self.config.max_length`, but no `config` attribute is
        assigned anywhere in this class and `Trainer` exposes `args`, not `config` —
        calling this likely raises AttributeError; confirm where `config` should come from.
        """
        policy_output = model.generate(
            batch["prompt_input_ids"],
            attention_mask=batch["prompt_attention_mask"],
            max_length=self.config.max_length,
            do_sample=True,
            pad_token_id=self.tokenizer.pad_token_id,
        )
        if self.ref_model is None:
            # PEFT case: sample from the policy with adapters disabled as the reference.
            with self.accelerator.unwrap_model(self.model).pretrained_model.disable_adapter():
                reference_output = self.model.generate(
                    batch["prompt_input_ids"],
                    attention_mask=batch["prompt_attention_mask"],
                    max_length=self.config.max_length,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                )
        else:
            reference_output = self.ref_model.generate(
                batch["prompt_input_ids"],
                attention_mask=batch["prompt_attention_mask"],
                max_length=self.config.max_length,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,
            )

        # Pad both outputs to a fixed length before decoding so shapes line up.
        policy_output = pad_to_length(policy_output, self.config.max_length, self.tokenizer.pad_token_id)
        policy_output_decoded = self.tokenizer.batch_decode(policy_output, skip_special_tokens=True)

        reference_output = pad_to_length(reference_output, self.config.max_length, self.tokenizer.pad_token_id)
        reference_output_decoded = self.tokenizer.batch_decode(reference_output, skip_special_tokens=True)

        return policy_output_decoded, reference_output_decoded
    def prediction_step(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ):
        """Trainer hook: evaluation step returning (loss, logits, labels)."""
        if ignore_keys is None:
            # Fall back to the model config's ignore list when the caller gave none.
            if hasattr(model, "config"):
                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []
        with torch.no_grad():
            loss, metrics = self.get_batch_metrics(model, inputs, train_eval="eval")

        # force log the metrics
        if self.accelerator.is_main_process:
            self.store_metrics(metrics, train_eval="eval")

        if prediction_loss_only:
            return loss.detach(), None, None

        # logits for the chosen and rejected samples from model
        # NOTE(review): `metrics` values are numpy scalar means here, so
        # `torch.stack` over them looks type-unsafe — confirm this branch
        # (prediction_loss_only=False) is actually exercised.
        logits_dict = {
            "logits_test/chosen": metrics["eval_logits/chosen"],
            "logits_test/rejected": metrics["eval_logits/rejected"],
        }
        logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys)
        logits = torch.stack(logits).mean(axis=1)
        labels = torch.zeros(logits.shape[0])

        return loss.detach(), logits, labels

    def store_metrics(self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
        """Accumulate per-step metrics; they are averaged and flushed by `log()`."""
        for key, value in metrics.items():
            self._stored_metrics[train_eval][key].append(value)
""" # logs either has 'loss' or 'eval_loss' train_eval = "train" if "loss" in logs else "eval" # Add averaged stored metrics to logs for key, metrics in self._stored_metrics[train_eval].items(): logs[key] = torch.tensor(metrics).mean().item() del self._stored_metrics[train_eval] return super().log(logs) ================================================ FILE: src/pretrain.py ================================================ import sys sys.path.insert(0, "/root/autodl-tmp/Code/RLHF") sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt") # sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt") sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt") import os import argparse import torch import evaluate import json from tqdm import tqdm from transformers import ( Trainer, TrainingArguments, default_data_collator, GenerationConfig, ) from src.utils import RESOURCE_PATH, load_tokenizer_and_model, load_checkpoint from src.data.data import PretrainDataset, chatglm2_encode, chatglm3_encode from src.utils.file_utils import set_seed, print_rank_0 # from src.models.llama import LlamaForCausalLM # Create a preprocessing function to extract out the proper logits from the model output def preprocess_logits_for_metrics(logits, labels): if isinstance(logits, tuple): logits = logits[0] return logits.argmax(dim=-1) def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--model_name_or_path", type=str, required=True) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--bits", type=int, default=32) parser.add_argument("--device_map", type=str, default=None, help="device map to allocate model," "[None] means cpu" "[0, 1, 2, ...], number means single-card" "[auto, balanced, balanced_low_0] means multi-card") 
parser.add_argument("--low_cpu_mem_usage", action="store_true", help="whether to enable low cpu memory usage" "when loading model") parser.add_argument("--max_length", type=int, default=1024) # train parser.add_argument("--do_train", action="store_true") parser.add_argument("--train_filename", type=str, default=None) parser.add_argument("--concat_samples", action="store_true") parser.add_argument("--num_epochs", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-6) parser.add_argument("--lr_scheduler_type", type=str, default="cosine", help="transformers.trainer_utils.SchedulerType, including:" "linear, cosine, cosine_with_restarts, polynomial, constant," "constant_with_warmup") parser.add_argument("--train_batch_size", type=int, default=4) parser.add_argument("--weight_decay", type=float, default=0.1) parser.add_argument("--warmup_ratio", type=int, default=0.1) parser.add_argument("--logging_steps", type=int, default=100) parser.add_argument("--save_strategy", type=str, default="steps", help='- `"no"`: No save is done during training.' '- `"epoch"`: Save is done at the end of each epoch.' 
'- `"steps"`: Save is done every `save_steps`.') parser.add_argument("--save_steps", type=int, default=1000) parser.add_argument("--save_total_limit", type=int, default=2) parser.add_argument("--metric_for_best_model", type=str, default=None) parser.add_argument("--gradient_accumulation_steps", type=int, default=8) parser.add_argument("--gradient_checkpointing", action="store_true", help="If True, use gradient checkpointing to save memory at the expense of slower backward pass.") parser.add_argument("--deepspeed_config", type=str, default=None) parser.add_argument("--lora_rank", type=int, default=0) parser.add_argument("--lora_alpha", type=int, default=16) parser.add_argument("--lora_train_bias", type=str, default="none") # eval parser.add_argument("--do_eval", action="store_true") parser.add_argument("--eval_filename", type=str, default=None) parser.add_argument("--eval_batch_size", type=int, default=4) parser.add_argument("--evaluation_strategy", type=str, default="steps", help='- `"no"`: No evaluation is done during training.' '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.' 
def main():
    """Entry point: optionally train, evaluate and/or run generation prediction."""
    args = get_parser()
    print_rank_0(f"Parameters: {args}")

    set_seed(args.seed)

    # load tokenizer and model
    tokenizer, model, eos_token_id = load_tokenizer_and_model(args)
    # baichuan2/qwen ship their own GenerationConfig; override sampling knobs from CLI.
    if "baichuan2" in args.model_name_or_path.lower() or "qwen" in args.model_name_or_path.lower():
        generation_config = GenerationConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
        generation_config.do_sample = args.do_sample
        generation_config.max_new_tokens = args.max_length_generation
        model.generation_config = generation_config
    if args.checkpoint is not None:
        load_checkpoint(args, model)
    print_rank_0(f"Finished loading model and tokenizer")

    # Set up the datasets
    if args.do_train:
        train_dataset = PretrainDataset(args, os.path.join(args.data_dir, args.train_filename),
                                        tokenizer, concat_samples=args.concat_samples)
    else:
        train_dataset = None
    if args.do_eval:
        dev_dataset = PretrainDataset(args, os.path.join(args.data_dir, args.eval_filename),
                                      tokenizer, concat_samples=False)
    else:
        dev_dataset = None
    # if args.do_pred:
    #     test_dataset = PretrainDataset(args, os.path.join(args.data_dir, args.test_filename),
    #                                    tokenizer)
    # else:
    #     test_dataset = None

    if args.do_train:
        # Prefer bf16 on Ampere+ (compute capability >= 8), otherwise fp16 on GPU.
        if torch.cuda.is_available():
            bf16 = torch.cuda.get_device_capability()[0] >= 8
            fp16 = not bf16
        else:
            fp16 = False
            bf16 = False
        # training arguments
        deepspeed_config = os.path.join(RESOURCE_PATH, "config", "deepspeed", args.deepspeed_config) if args.deepspeed_config is not None else None
        training_args = TrainingArguments(
            output_dir=args.output_dir,
            no_cuda=not torch.cuda.is_available(),
            seed=args.seed,
            data_seed=args.seed,
            local_rank=args.local_rank,
            do_train=args.do_train,
            num_train_epochs=args.num_epochs,
            learning_rate=args.learning_rate,
            lr_scheduler_type=args.lr_scheduler_type,
            per_device_train_batch_size=args.train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            warmup_ratio=args.warmup_ratio,
            weight_decay=args.weight_decay,
            half_precision_backend="auto",
            fp16=fp16,
            bf16=bf16,
            adam_beta1=0.9,
            adam_beta2=0.95,
            save_strategy=args.save_strategy,
            save_steps=args.save_steps,
            save_total_limit=args.save_total_limit,
            metric_for_best_model=args.metric_for_best_model,
            greater_is_better=True,
            logging_steps=args.logging_steps,
            report_to=["tensorboard"],
            deepspeed=deepspeed_config,
            gradient_checkpointing=args.gradient_checkpointing,
            do_eval=args.do_eval,
            evaluation_strategy=args.evaluation_strategy,
            eval_steps=args.eval_steps,
            eval_accumulation_steps=args.eval_accumulation_steps,
            per_device_eval_batch_size=args.eval_batch_size,
            # do_predict=args.do_pred,
            # use_legacy_prediction_loop=args.do_pred,
        )
        print_rank_0(f"Training Arguments: {training_args}")

        # Set up the metric
        rouge = evaluate.load("rouge")

        def compute_metrics(eval_preds):
            # eval_preds.predictions are token ids (see preprocess_logits_for_metrics).
            labels_ids = eval_preds.label_ids
            pred_ids = eval_preds.predictions
            pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
            label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
            result = rouge.compute(predictions=pred_str,
                                   references=label_str)
            return result

        # Prepare the trainer and start training
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
            compute_metrics=compute_metrics,
            data_collator=default_data_collator,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )
        # model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
        trainer.train()
        trainer.save_model(args.output_dir)
    elif args.do_eval:
        # res = trainer.evaluate(eval_dataset=dev_dataset)
        # logger.info(res)
        pass

    if args.do_pred:
        device = f"cuda:{args.local_rank}" if torch.cuda.is_available() and args.device_map is not None else "cpu"
        model.eval()
        # tokenizer.padding_side = "left"
        if args.test_filename is not None:
            test_file = args.test_filename
        else:
            test_file = os.path.join(RESOURCE_PATH, "test_prompts.jsonl")
        data_types = args.data_types.split(",") if args.data_types is not None else None
        # NOTE(review): `args.max_length - args.max_length_generation` below requires
        # --max_length_generation to be set (its default is None) — confirm callers always pass it.
        with torch.no_grad():
            if args.output_filename is not None:
                w = open(os.path.join(args.output_dir, args.output_filename), "w", encoding="utf-8")
            else:
                w = None
            for line in tqdm(open(test_file, "r", encoding="utf-8"), desc="Prediction"):
                # for line in open(test_file, "r", encoding="utf-8"):
                test_data = json.loads(line.strip("\n"))
                data_type = test_data.get('data_type', None)
                if data_types is not None and data_type not in data_types:
                    continue
                prompt = test_data['prompt']
                prefix = test_data.get('prefix', None)
                system = test_data.get('system', "")
                label = test_data.get('label', None)
                # encoded_prompt = tokenizer(prompt)
                # Model-family specific prompt encoding / generation.
                if "chatglm3" in args.model_name_or_path.lower():
                    _, _, prompt_ids = chatglm3_encode(tokenizer, query=prompt, label=None, system=system,
                                                      max_length=args.max_length, is_prefix=True)
                    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
                    outputs = model.generate(input_ids=input_ids,
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=eos_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                    prompt_length = len(prompt_ids)
                    # Strip the prompt tokens so only the generated continuation is decoded.
                    results = tokenizer.batch_decode([output[prompt_length:] for output in outputs],
                                                     skip_special_tokens=True)
                elif "chatglm2" in args.model_name_or_path.lower():
                    # results, history = model.chat(tokenizer, prompt, history=None, do_sample=False,
                    #                               max_new_tokens=args.max_length_generation)
                    # results = [results]
                    _, _, prompt_ids = chatglm2_encode(tokenizer, query=prompt, label=None, system=system,
                                                      max_length=args.max_length, is_prefix=True)
                    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
                    outputs = model.generate(input_ids=input_ids,
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=eos_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                    prompt_length = len(prompt_ids)
                    results = tokenizer.batch_decode([output[prompt_length:] for output in outputs],
                                                     skip_special_tokens=True)
                elif "chatglm" in args.model_name_or_path.lower():
                    inputs = tokenizer(prompt, max_length=args.max_length-args.max_length_generation,
                                       truncation="only_first", return_tensors="pt")
                    inputs = inputs.to(device)
                    outputs = model.generate(inputs=inputs['input_ids'],
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=eos_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature,
                                             repetition_penalty=args.repetition_penalty)
                    prompt_length = len(inputs['input_ids'][0])
                    results = tokenizer.batch_decode([output[prompt_length:] for output in outputs],
                                                     skip_special_tokens=True)
                # elif "glm" in args.model_name_or_path.lower():
                #     encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)
                #     prompt_length = len(encoded_prompt['input_ids'])
                #     encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token,
                #                              max_length=min(prompt_length, args.max_length),
                #                              truncation="only_first",
                #                              return_tensors="pt",
                #                              return_token_type_ids=False)
                #     max_gen_length = args.max_length - encoded_dict['input_ids'].shape[1]
                #     inputs = tokenizer.build_inputs_for_generation(encoded_dict,
                #                                                    max_gen_length=max_gen_length, padding=True)
                #     inputs = inputs.to(device)
                #     outputs = model.generate(**inputs,
                #                              max_new_tokens=min(args.max_length_generation, max_gen_length),
                #                              eos_token_id=tokenizer.eop_token_id,
                #                              pad_token_id=tokenizer.pad_token_id,
                #                              do_sample=args.do_sample,
                #                              num_return_sequences=args.num_return_sequences,
                #                              top_k=args.top_k,
                #                              top_p=args.top_p,
                #                              temperature=args.temperature)
                elif "baichuan2" in args.model_name_or_path.lower():
                    messages = [{"role": "user", "content": prompt}]
                    results = model.chat(tokenizer, messages)
                    results = [results]
                elif "qwen" in args.model_name_or_path.lower():
                    results, history = model.chat(tokenizer, prompt, history=None)
                    results = [results]
                else:
                    if prefix is not None and len(prefix) > 0:
                        prompt += prefix
                    inputs = tokenizer(prompt, max_length=args.max_length-args.max_length_generation,
                                       truncation="only_first", return_tensors="pt")
                    inputs = inputs.to(device)
                    outputs = model.generate(inputs=inputs['input_ids'],
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=eos_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature,
                                             repetition_penalty=args.repetition_penalty)
                    prompt_length = len(inputs['input_ids'][0])
                    results = tokenizer.batch_decode([output[prompt_length:] for output in outputs],
                                                     skip_special_tokens=True)
                # p = tokenizer.decode(encoded_prompt['input_ids'], skip_special_tokens=True)
                answers = []
                for r in results:
                    # answer = r.replace(p, "").strip()
                    answer = r
                    # print_rank_0(f"\nprompt: {prompt}\nanswer: {answer}")
                    answers.append({"answer": answer, "score": None})
                if w is not None:
                    w.write(json.dumps({"prompt": prompt, "prefix": prefix, "system": system,
                                        "answers": answers, "label": label}, ensure_ascii=False)+"\n")
            if w is not None:
                w.close()
# Create a preprocessing function to extract out the proper logits from the model output
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to greedy token-id predictions for metric computation."""
    # Some models return a tuple whose first element holds the logits.
    raw = logits[0] if isinstance(logits, tuple) else logits
    return raw.argmax(dim=-1)
parser.add_argument("--bits", type=int, default=32) parser.add_argument("--device_map", type=str, default=None, help="device map to allocate model," "[None] means cpu" "[0, 1, 2, ...], number means single-card" "[auto, balanced, balanced_low_0] means multi-card") parser.add_argument("--low_cpu_mem_usage", action="store_true", help="whether to enable low cpu memory usage" "when loading model") # train parser.add_argument("--do_train", action="store_true") parser.add_argument("--train_filename", type=str, default=None) parser.add_argument("--concat_samples", action="store_true") parser.add_argument("--num_epochs", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-6) parser.add_argument("--lr_scheduler_type", type=str, default="OneCycle", help="deepspeed scheduler types, including:" "LRRangeTest, OneCycle, WarmupLR, WarmupDecayLR") parser.add_argument("--train_batch_size", type=int, default=4) parser.add_argument("--weight_decay", type=float, default=0.1) parser.add_argument("--warmup_steps", type=int, default=100) parser.add_argument("--logging_steps", type=int, default=100) parser.add_argument("--save_strategy", type=str, default="steps", help='- `"no"`: No save is done during training.' '- `"epoch"`: Save is done at the end of each epoch.' 
'- `"steps"`: Save is done every `save_steps`.') parser.add_argument("--save_steps", type=int, default=1000) parser.add_argument("--save_total_limit", type=int, default=2) parser.add_argument("--metric_for_best_model", type=str, default=None) parser.add_argument("--max_grad_norm", type=float, default=1.0) parser.add_argument("--gradient_accumulation_steps", type=int, default=8) parser.add_argument("--gradient_checkpointing", action="store_true", help="If True, use gradient checkpointing to save memory at the expense of slower backward pass.") parser.add_argument("--deepspeed_config", type=str, default=None) parser.add_argument("--lora_rank", type=int, default=0) parser.add_argument("--lora_alpha", type=int, default=16) parser.add_argument("--lora_train_bias", type=str, default="none") # eval parser.add_argument("--do_eval", action="store_true") parser.add_argument("--eval_filename", type=str, default=None) parser.add_argument("--eval_batch_size", type=int, default=4) parser.add_argument("--evaluation_strategy", type=str, default="steps", help='- `"no"`: No evaluation is done during training.' '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.' 
def pred_single_sample(prompt, prefix, model, tokenizer, args, device, eos_token_id):
    """Generate answers for a single prompt and package them as a JSON-able dict.

    Args:
        prompt: The raw prompt text.
        prefix: Extra prefix text, recorded in the returned dict.
        model / tokenizer: Loaded HF model and tokenizer.
        args: Parsed CLI namespace (max_length, max_length_generation, sampling knobs).
        device: Target device for the encoded inputs.
        eos_token_id: End-of-sequence id passed to `generate` (chatglm branch).

    Returns:
        dict with keys "prompt", "prefix" and "answers" (list of {"answer", "score"}).
    """
    max_prompt_length = args.max_length - args.max_length_generation
    if "chatglm" in args.model_name_or_path.lower():
        # Measure the untruncated prompt first, then re-encode with a length cap.
        encoded_prompt = tokenizer(prompt)
        prompt_length = len(encoded_prompt['input_ids'])
        inputs = tokenizer(prompt, max_length=min(prompt_length, args.max_length),
                           truncation="only_first", return_tensors="pt")
        input_ids = inputs['input_ids']
        inputs = inputs.to(device)
        outputs = model.generate(inputs=inputs['input_ids'],
                                 max_new_tokens=args.max_length_generation,
                                 eos_token_id=eos_token_id,
                                 pad_token_id=tokenizer.pad_token_id,
                                 do_sample=args.do_sample,
                                 num_return_sequences=args.num_return_sequences,
                                 top_k=args.top_k,
                                 top_p=args.top_p,
                                 temperature=args.temperature)
    else:
        inputs = tokenizer(prompt, max_length=max_prompt_length,
                           truncation="longest_first", return_tensors="pt")
        input_ids = inputs['input_ids']
        inputs = inputs.to(device)
        outputs = model.generate(inputs=inputs['input_ids'],
                                 max_new_tokens=args.max_length_generation,
                                 do_sample=args.do_sample,
                                 num_return_sequences=args.num_return_sequences,
                                 top_k=args.top_k,
                                 top_p=args.top_p,
                                 temperature=args.temperature,
                                 repetition_penalty=args.repetition_penalty)
    results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # FIX: `input_ids` is a batched (1, seq_len) tensor from `return_tensors="pt"`;
    # `tokenizer.decode` expects a single sequence, so decode row 0 instead of the
    # whole 2-D tensor (which previously broke the prompt-stripping below).
    p = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    # Strip the echoed prompt so only the generated continuation remains.
    results = [result.replace(p, "").strip() for result in results]
    answers = []
    for r in results:
        print_rank_0(f"\nprompt: {p}\nanswer: {r}")
        answers.append({"answer": r, "score": None})
    d = {"prompt": prompt, "prefix": prefix, "answers": answers}
    return d
prompt = item['context'] result = pred_single_sample(prompt, "", model, tokenizer, args, device, eos_token_id) if args.local_rank <= 0: w.write(json.dumps(result, ensure_ascii=False)+"\n") def main(): args = get_parser() print_rank_0(f"Parameters: {args}") set_seed(args.seed) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) # load quantization config if torch.cuda.is_available(): bf16 = torch.cuda.get_device_capability()[0] >= 8 fp16 = not bf16 else: fp16 = False bf16 = False # create HfDeepSpeedConfig [must be called before instantiating model] if args.deepspeed_config is not None: ds_config_filename = os.path.join(RESOURCE_PATH, "config", "deepspeed", args.deepspeed_config) ds_config = json.load(open(ds_config_filename, "r", encoding="utf-8")) # ds_config["steps_per_print"] = args.logging_steps ds_config["train_micro_batch_size_per_gpu"] = args.train_batch_size ds_config["gradient_accumulation_steps"] = args.gradient_accumulation_steps ds_config["gradient_clipping"] = args.max_grad_norm # TODO: before calling dist init, world size is always 1, therefore ds_config['train_batch_size'] cannot multiply world size ds_config['train_batch_size'] = args.train_batch_size * args.gradient_accumulation_steps #* torch.cuda.device_count() # TODO: assuming hidden_size=4096 ds_config["zero_optimization"]["reduce_bucket_size"] = 4096 * 4096 ds_config["zero_optimization"]["stage3_prefetch_bucket_size"] = 0.9 * 4096 * 4096 ds_config["zero_optimization"]["stage3_param_persistence_threshold"] = 10 * 4096 ds_config["fp16"]["enabled"] = fp16 ds_config["bf16"]["enabled"] = bf16 ds_config["optimizer"]["params"] = { "lr": args.learning_rate, "betas": [0.9, 0.95], "eps": 1e-8, "weight_decay": args.weight_decay } assert ds_config["scheduler"]['type'] == args.lr_scheduler_type ds_config["scheduler"]["params"] = { "cycle_min_lr": 0, "cycle_max_lr": args.learning_rate, "cycle_first_step_size": args.warmup_steps } current_time = 
datetime.now().strftime('%Y-%m-%d_%H-%M-%S') ds_config['tensorboard']['job_name'] = f"deepspeed-{current_time}" dschf = HfDeepSpeedConfig(ds_config) # keep this object alive # load tokenizer and model tokenizer, model, eos_token_id = load_tokenizer_and_model(args, with_trainer=False) print_gpu_utilization("after from_pretrained()", args.local_rank) if args.checkpoint is not None: load_checkpoint(args, model) print_rank_0(f"Finished loading model and tokenizer") # Set up the datasets if args.do_train: train_dataset = PretrainDataset(args, os.path.join(args.data_dir, args.train_filename), tokenizer, concat_samples=args.concat_samples) else: train_dataset = None if args.do_eval: eval_dataset = PretrainDataset(args, os.path.join(args.data_dir, args.eval_filename), tokenizer, concat_samples=False) # Set up the metric rouge = evaluate.load("rouge") def compute_metrics(pred_ids, label_ids): pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True) result = rouge.compute(predictions=pred_str, references=label_str) return result else: eval_dataset = None if args.do_train: # # Optimizer # AdamOptimizer = DeepSpeedCPUAdam if "3" in args.deepspeed_config else FusedAdam # optim_params = get_optimizer_grouped_parameters( # actor_model, self.args.actor_weight_decay) # optim = AdamOptimizer(optim_params, # lr=self.args.actor_learning_rate, # betas=(0.9, 0.95)) # # # LR Scheduler # lr_scheduler = get_scheduler( # name=self.args.lr_scheduler_type, # optimizer=optim, # num_warmup_steps=self.args.warmup_steps, # num_training_steps=self.num_total_iters, # ) # deepspeed initialize ds_config['train_batch_size'] = args.train_batch_size * args.gradient_accumulation_steps * torch.cuda.device_count() model_engine, *_ = deepspeed.initialize(model=model, # optimizer=optim, # lr_scheduler=lr_scheduler, config=ds_config) print_gpu_utilization("after deepspeed.initialize()", args.local_rank) # create data loader 
train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, # collate_fn=data_collator, sampler=train_sampler, batch_size=args.train_batch_size) if args.do_eval: eval_sampler = DistributedSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, # collate_fn=data_collator, sampler=eval_sampler, batch_size=args.eval_batch_size) def eval(step): model_engine.eval() eval_results = dict() with torch.no_grad(): for eval_batch in eval_dataloader: eval_batch = {k: v.to(device) for k, v in eval_batch.items()} eval_output = model_engine(**eval_batch) pred_ids = preprocess_logits_for_metrics(eval_output.logits, None) result_rouge = compute_metrics(pred_ids, eval_batch['labels']) for k, v in result_rouge.items(): key = f"eval_{k}" if key not in eval_results: eval_results[key] = [] eval_results[key].append(v) if "eval_loss" not in eval_results: eval_results['eval_loss'] = [] eval_results['eval_loss'].append(eval_output.loss.tolist()) if args.do_pred: pred(args, model_engine, tokenizer, device, eos_token_id, step) model_engine.train() for k, v in eval_results.items(): eval_results[k] = np.mean(eval_results[k]) return eval_results # training model_engine.train() if args.gradient_checkpointing: model_engine.module.gradient_checkpointing_enable() print_gpu_utilization("before training begin", args.local_rank) global_step = 0 best_metric = None best_model_checkpoint = None if args.do_eval: assert args.eval_steps <= args.save_steps and args.save_steps % args.eval_steps == 0, \ f"save steps should be greater than eval steps and be a multiple of eval steps" eval_results = eval(global_step) print_rank_0(f"Epoch-0, Gloal step-{global_step}, Evaluation result: {eval_results}") if args.metric_for_best_model is not None: assert args.metric_for_best_model in eval_results, \ f"{args.metric_for_best_model} is not a valid metric, " \ f"please choose from the following metrics: {eval_results.keys()}" for epoch in range(args.num_epochs): 
print_rank_0(f"Beginning of Epoch {epoch+1}/{args.num_epochs}") for step, batch in enumerate(train_dataloader): batch = {k: v.to(device) for k, v in batch.items()} logger.debug(f"batch keys: {batch.keys()}") output = model_engine(**batch) model_engine.backward(output.loss) model_engine.step() global_step += 1 if global_step % args.logging_steps == 0: print_rank_0(f"Epoch-{epoch+1}, Gloal step-{global_step}, loss: {output.loss}") if args.do_eval and global_step % args.eval_steps == 0: eval_results = eval(global_step) print_rank_0(f"Epoch-{epoch+1}, Gloal step-{global_step}, Evaluation result: {eval_results}") if global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}") if args.do_eval and args.metric_for_best_model is not None: if ( best_metric is None or best_model_checkpoint is None or eval_results[args.metric_for_best_model] > best_metric ): best_metric = eval_results[args.metric_for_best_model] best_model_checkpoint = output_dir rotate_checkpoints(args.save_total_limit, use_mtime=True, output_dir=args.output_dir, best_model_checkpoint=best_model_checkpoint) # save_zero_three_model(model_engine, args.local_rank, # save_dir=output_dir, # zero_stage=ds_config['zero_optimization']['stage']) # model_engine.save_16bit_model(output_dir) model_engine.save_checkpoint(args.output_dir, f"checkpoint-{global_step}") print_rank_0(f"Finished saving checkpoint @Step-{global_step}") print_rank_0(f"Finished training! 
epochs: {epoch+1}, steps: {global_step}") output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}") # save_zero_three_model(model_engine, args.local_rank, # save_dir=output_dir, # zero_stage=ds_config['zero_optimization']['stage']) # model_engine.save_16bit_model(output_dir) model_engine.save_checkpoint(args.output_dir, f"checkpoint-{global_step}") print_rank_0(f"Finished saving checkpoint @Step-{global_step}") elif args.do_eval: pass if args.do_pred: model.eval() device = f"cuda:{args.local_rank}" if torch.cuda.is_available() else "cpu" model = model.to(device) # tokenizer.padding_side = "left" pred(args, model, tokenizer, device, eos_token_id) if __name__ == "__main__": main() ================================================ FILE: src/resources/config/deepspeed/Experiment.md ================================================ 为验证不同预训练模型使用deepspeed的训练效率是否能达到官方宣称的效果(加速、节省GPU等),进行了benchmarking - 实验场景:SFT阶段训练 - 实验数据:SFT & Reward Data的验证集,共1万条样本 - 实验参数:```batch_size=1, max_sequence_length=512, gradient_checkpointing=False```
模型 整体耗时/epoch 单条样本耗时 内存使用量 显存使用量 GPU型号和数量 fp16 bf16 deepspeed stage offload optimizer pin memory offload param overlap comm allgather bucket size stage3 max live parameters
Pangu-350M 20min 1.17s/it 1*8750MB 1*V100 16G false - - - - - - - -
19min 1.03s/it 1*9010MB 1*V100 16G true - - - - - - - -
7.5min 1.10s/it 3*9406MB 3*V100 16G true - 0 - - - - - -
6.5min 1.05s/it 3*5674MB 3*V100 16G true - 1 - - - - - -
14min 2.10s/it 3*6262MB 3*V100 16G true - 2 false - - false 5e8 -
18min 2.6s/it 18G 3*3668MB 3*V100 16G true - 2 true true - false 2e8 -
18.5min 2.65s/it 18G 3*4240MB 3*V100 16G true - 2 true true - false 5e8 -
18.5min 2.65s/it 18G 3*5194MB 3*V100 16G true - 2 true true - false 1e9 -
19min 2.80s/it 18G 3*12824MB 3*V100 16G true - 2 true true - false 5e9 -
47min 6.75s/it 14G 3*4914MB 3*V100 16G true - 3 false - false false 1e9 -
3.3h 29s/it 18G 3*3512MB 3*V100 16G true - 3 true true false false 1e9 -
4h 34s/it 24G 3*3466MB 3*V100 16G true - 3 true true true false 1e9 -
3.8h 33s/it 24G 3*3746MB 3*V100 16G true - 3 true true true true 1e9 -
3.8h 33s/it 24G 3*3594MB 3*V100 16G true - 3 true true true true 5e8 -
4h 35s/it 24G 3*3526MB 3*V100 16G true - 3 true true true true 2e8 -
Pangu-2.6B - - - OOM 1*V100 16G false - - - - - - - -
- - - OOM 1*V100 16G true - - - - - - - -
- - - OOM 3*V100 16G true - 0 - - - - - -
- - - OOM 3*V100 16G true - 1 - - - - - -
- - - OOM 3*V100 16G true - 2 false - - false 5e8 -
1.5h 12.3s/it 59G 3*10796MB 3*V100 16G true - 2 true true - false 2e8 -
1.3h 12s/it 59G 3*11368MB 3*V100 16G true - 2 true true - false 5e8 -
- - - OOM 3*V100 16G true - 3 false - false false - 1e9
7.5h 64.5s/it 58G 3*13428MB 3*V100 16G true - 3 true true false false - 1e9
11.3h 95s/it 109G 3*12170MB 3*V100 16G true - 3 true true true false - 1e9
Pangu-2.6B 1.32s/it 1*49347MB 1*A100 80G false false - - - - - - -
1.27s/it 1*52783MB 1*A100 80G true false - - - - - - -
1.27s/it 1*52783MB 1*A100 80G false true - - - - - - -
- CUDA Error 1*A100 80G false true 0 - - - - - -
- CUDA Error 1*A100 80G false true 1 - - - - - -
- CUDA Error 1*A100 80G false true 2 false - - false 5e8 -
9-11s/it 1*12537MB 1*A100 80G false true 2 true true - false 2e8 -
8-9s/it 1*13539MB 1*A100 80G false true 2 true true - false 5e8 -
9-11s/it 1*15041MB 1*A100 80G false true 2 true true - false 1e9 -
8-9s/it 1*14887MB 1*A100 80G false true 2 true true - true 5e8 -
- CUDA Error 1*A100 80G false true 3 false - false false 5e8 1e9
17-18s/it 1*16935MB 1*A100 80G false true 3 true true false false - 1e9
20-21s/it 1*12219MB 1*A100 80G false true 3 true true true false - 1e9
19-20s/it 1*15981MB 1*A100 80G false true 3 true true true true - 1e9
22-23s/it 1*12023MB 1*A100 80G false true 3 true true true true - 5e8
20-21s/it 1*12023MB 1*A100 80G false true 3 true true true true - 2e8
================================================ FILE: src/resources/config/deepspeed/README.md ================================================ # Deepspeed Configuration - deepspeed==0.8.2 - transformers==4.26.1 - using ```transformers.Trainer``` and ```transformers.TrainingArguments``` Example of deepspeed config with key items explained: ```bash { "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 100, "wall_clock_breakdown": false # "Enable timing of the latency of forward/backward/update training phases" "optimizer": { "type": "Adam", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } }, "scheduler": { "type": "WarmupLR", "params": { "warmup_min_lr": "auto", "warmup_max_lr": "auto", "warmup_num_steps": "auto" } }, "fp16": { "enabled": "auto", "auto_cast": false, # automatically casts inputs to fp16 "loss_scale": 0, # a fp16 parameter representing the loss scaling value for FP16 training. The default value of 0.0 results in dynamic loss scaling, otherwise the value will be used for static fixed loss scaling "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, # "BFLOAT16 requires hardware support (e.g., NVIDIA A100). Training with bfloat16 does not require loss scaling" "bf16": { "enabled": "auto" }, "zero_optimization": { "stage": [0|1|2|3], # "Stage 0, 1, 2, and 3 refer to disabled, optimizer state partitioning, and optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning, respectively." "offload_optimizer": { "device": "[cpu|nvme]", "pin_memory": true, # "This feature can improve the throughput at the cost of making less memory available to other processes. 
Pinned memory is set aside to the specific process that requested it and its typically accessed much faster than normal CPU memory" # all nvme-related params "nvme_path": "/local_nvme", "buffer_count": 4, "fast_init": false }, "offload_param": { "device": "[cpu|nvme]", "pin_memory": true, # "This feature can improve the throughput at the cost of making less memory available to other processes. Pinned memory is set aside to the specific process that requested it and its typically accessed much faster than normal CPU memory" # all nvme-related params "nvme_path": "/local_nvme", "buffer_count": 5, "buffer_size": 1e8, "max_in_cpu": 1e9 }, "overlap_comm": false, # "if set to true, trades off increased GPU RAM usage to lower all-reduce latency. overlap_comm uses 4.5x the allgather_bucket_size and reduce_bucket_size values. So if they are set to 5e8, this requires a 9GB footprint (5e8 x 2Bytes x 2 x 4.5). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting OOM-errors you will need to reduce those parameters to about 2e8, which would require 3.6GB" "reduce_bucket_size": "auto", # "Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes. When set auto, it equals hidden_size*hidden_size" # only stage-2 params "allgather_partitions": true, "allgather_bucket_size": 5e8, # "Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes" "reduce_scatter": true, "contiguous_gradients" : true, # "Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass" "round_robin_gradients": [true|false], # "Stage 1 and 2 optimization for CPU offloading that parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism)." 
# only stage-3 params "stage3_max_live_parameters" : 1e9, # "The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. 1e9 would consume ~2GB" "stage3_max_reuse_distance" : 1e9, # "Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication." "stage3_prefetch_bucket_size" : "auto", # "The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. When set auto, it equals 0.9 * hidden_size * hidden_size" "stage3_param_persistence_threshold" : "auto", # "Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). When set auto, it equals 10 * hidden_size" "sub_group_size" : 1e12, # controls the granularity in which parameters are updated during optimizer steps. Parameters are grouped into buckets of sub_group_size and each buckets is updated one at a time. When used with NVMe offload in ZeRO-Infinity, sub_group_size therefore controls the granularity in which model states are moved in and out of CPU memory from NVMe during the optimizer step. This prevents running out of CPU memory for extremely large models. "elastic_checkpoint" : [true|false], "stage3_gather_16bit_weights_on_model_save": true, # Consolidate the weights before saving the model by save_16bit_model(). Since the weights are partitioned across GPUs, they aren’t part of state_dict, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. "ignore_unused_parameters": true # Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks. 
This controls whether or not training should terminate with an error message when unused parameters are detected }, # DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed "autotuning": {}, # Flops Profiler helps users easily measure both the model training/inference speed (latency, throughput) and efficiency (floating-point operations per second, i.e., FLOPS) of a model and its submodules "flops_profiler": {}, "activation_checkpointing": {}, "sparse_attention": {}, # DeepSpeed Data Efficiency Library includes two techniques: curriculum learning and random layerwise token dropping (random-LTD). "data_efficiency": {}, # Compression has seven different components, including layer reduction, weight quantization, activation quantization, sparse pruning, row pruning, head pruning, and channel pruning "compression_training": {} } ``` Note that the special value ```auto``` in the configuration will be automatically replaced with the correct or most efficient value from ```transformers.TrainingArguments``` ## How to Choose Which ZeRO Stage and Offloads To Use For Best Performance ### Guideline - Speed-wise (left is faster than right) Stage 0 (DDP) > Stage 1 > Stage 2 > Stage 2 + offload > Stage 3 > Stage 3 + offloads - GPU Memory usage-wise (right is more GPU memory efficient than left) Stage 0 (DDP) < Stage 1 < Stage 2 < Stage 2 + offload < Stage 3 < Stage 3 + offloads ### Memory requirements Since Deepspeed ZeRO can offload memory to CPU (and NVMe) the framework provides utils that allow one to tell how much CPU and GPU memory will be needed depending on the number of GPUs being used. 
Using "bigscience/T0_3B" and one GPU as example: ```python from transformers import AutoModel from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live model = AutoModel.from_pretrained("bigscience/T0_3B") # stage 1 and 2 estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1) # stage 3 estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1) ``` ### Steps to find the optimal configuration So when you want to get the fastest execution while fitting into minimal number of GPUs, here is the process you could follow. We start with the fastest approach and if running into GPU OOM we then go to the next slower approach, but which will use less GPU memory. And so on and so forth. First of all set batch size to 1 (you can always use gradient accumulation for any desired effective batch size). 1. Enable ```--gradient_checkpointing 1``` (HF Trainer) or directly ```model.gradient_checkpointing_enable()``` - if OOM then 2. Try ZeRO stage 2 first. if OOM then 3. Try ZeRO stage 2 + ```offload_optimizer``` - if OOM then 4. Switch to ZeRO stage 3 - if OOM then 5. Enable ```offload_param``` to ```cpu``` - if OOM then 6. Enable ```offload_optimizer``` to ```cpu``` - if OOM then 7. If you still can’t fit a batch size of 1 first check various default values and lower them if you can. For example, if you use ```generate``` and you don’t use a wide search beam make it narrower as it’d take a lot of memory. 8. Definitely use mixed half-precision over fp32 - so bf16 on Ampere and higher GPUs and fp16 on older gpu architectures. 9. If you still OOM you could add more hardware or enable ZeRO-Infinity - that is switch offloads ```offload_param``` and ```offload_optimizer``` to ```nvme```. You need to make sure it’s a very fast nvme. 
You can, of course, work through these steps in reverse by starting with the most GPU memory efficient config and then going backwards. Or try bi-secting it. ## Tricks & Troubleshooting - If you are training from scratch, try to have tensors with shapes that are divisible by 16 (e.g. hidden size). For batch size try divisible by 2 at least. There are wave and tile quantization divisibility that is hardware-specific if you want to squeeze even higher performance from your GPUs. - It’s possible to adjust ZeRO-3 configuration to make it perform closer to ZeRO-2: - set ```stage3_param_persistence_threshold``` to a very large number - larger than the largest parameter, e.g., ```6 * hidden_size * hidden_size```. This will keep the parameters on the GPUs. - turn off ```offload_params``` since ZeRO-2 doesn’t have that option. - ```overlap_comm```: if true, trades off increased GPU RAM usage to lower all-reduce latency. ```overlap_comm``` uses 4.5x the ```allgather_bucket_size``` and ```reduce_bucket_size``` values. So if they are set to 5e8, this requires a 9GB footprint (5e8 x 2Bytes x 2 x 4.5). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting OOM-errors you will need to reduce those parameters to about 2e8, which would require 3.6GB. You will want to do the same on larger capacity GPU as well, if you’re starting to hit OOM - ```sub_group_size```: You can leave it to default value when not using NVMe offload. You may want to change its default value in the following cases: - Running into OOM during optimizer step: Reduce sub_group_size to reduce memory utilization of temporary buffers; - Optimizer Step is taking a long time: Increase sub_group_size to improve bandwidth utilization as a result of the increased data buffers. - ```activation_checkpointing```: activation checkpointing and gradient checkpointing refer to the same methodology. But enabling ```activation_checkpointing``` in deepSpeed config has no effect on huggingface transformers. 
If you want to use a HF Transformers models you can do model.gradient_checkpointing_enable() or use --gradient_checkpointing in the HF Trainer, which will automatically enable this for you. - Using fp16 and you see in your log that Deepspeed reports OVERFLOW! as follows: ```bash [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 262144 ``` that means that the Deepspeed loss scaler can’t figure out a scaling co-efficient that overcomes loss overflow. In this case you usually need to raise the value of ```initial_scale_power``` to 32 which will typically solve the problem. # Main Resources - [Huggingface Deepspeed Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed#deepspeed-integration) - [Deepspeed Docs](https://www.deepspeed.ai/) - [Deepspeed Github](https://github.com/microsoft/DeepSpeed) ================================================ FILE: src/resources/config/deepspeed/stage-1.json ================================================ { "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 100, "wall_clock_breakdown": false, "zero_optimization": { "stage": 1 }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "bf16": { "enabled": "auto" }, "optimizer": { "type": "Adam", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } } } ================================================ FILE: src/resources/config/deepspeed/stage-2.json ================================================ { "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 100, "wall_clock_breakdown": false, "zero_optimization": { "stage": 2, "offload_optimizer": { "device": "cpu", "pin_memory": true }, 
"overlap_comm": false, "allgather_partitions": true, "allgather_bucket_size": 5e8, "reduce_scatter": true, "reduce_bucket_size": 5e8, "contiguous_gradients" : true, "round_robin_gradients": true }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "bf16": { "enabled": "auto" }, "optimizer": { "type": "Adam", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } } } ================================================ FILE: src/resources/config/deepspeed/stage-3-no_trainer.json ================================================ { "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 10, "wall_clock_breakdown": false, "zero_optimization": { "stage": 3, "offload_optimizer": { "device": "cpu", "pin_memory": false }, "offload_param": { "device": "cpu", "pin_memory": false }, "overlap_comm": false, "stage3_max_live_parameters" : 1e3, "stage3_max_reuse_distance" : 1e3, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size" : "auto", "stage3_param_persistence_threshold" : "auto", "sub_group_size" : 1e3, "stage3_gather_16bit_weights_on_model_save": true, "ignore_unused_parameters": true }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "bf16": { "enabled": "auto" }, "optimizer": { "type": "Adam", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } }, "scheduler": { "type": "OneCycle", "params": { "cycle_min_lr": 0, "cycle_max_lr": "auto", "cycle_first_step_size": "auto", "cycle_momentum": true, "cycle_min_mom": 0.85, "cycle_max_mom": 0.99 } }, "tensorboard": { "enabled": true, "output_path": "out/tensorboard/" } } ================================================ FILE: 
src/resources/config/deepspeed/stage-3.json ================================================ { "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 10, "wall_clock_breakdown": false, "zero_optimization": { "stage": 3, "offload_optimizer": { "device": "cpu", "pin_memory": true }, "offload_param": { "device": "cpu", "pin_memory": true }, "overlap_comm": false, "stage3_max_live_parameters" : 1e3, "stage3_max_reuse_distance" : 1e3, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size" : "auto", "stage3_param_persistence_threshold" : "auto", "sub_group_size" : 1e3, "stage3_gather_16bit_weights_on_model_save": true, "ignore_unused_parameters": true }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "bf16": { "enabled": "auto" }, "optimizer": { "type": "Adam", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } } } ================================================ FILE: src/resources/config/log.conf ================================================ [loggers] keys=root,main [handlers] keys=consoleHanlder,fileHandler [formatters] keys=fmt [logger_root] level=DEBUG handlers=consoleHanlder,fileHandler [logger_main] level=DEBUG qualname=main handlers=fileHandler [handler_consoleHanlder] class=StreamHandler level=INFO formatter=fmt args=(sys.stdout,) [handler_fileHandler] class=logging.handlers.TimedRotatingFileHandler level=INFO formatter=fmt args=('output.log', 'd', 1, 7) [formatter_fmt] format=%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)s] %(message)s datefmt=%Y/%m/%d %H:%M:%S ================================================ FILE: src/resources/config/ppo_model/ppo_config_glm-350M-chinese.yml ================================================ train: seq_length: 512 epochs: 1 total_steps: 50000 batch_size: 1 
checkpoint_interval: 1000 eval_interval: 1000 pipeline: "GLMPipeline" trainer: "AcceleratePPOTrainer" tracker: "tensorboard" logging_dir: "tensorboard" model: model_path: "/root/autodl-tmp/Data/models/glm-350M-chinese" num_layers_unfrozen: -1 model_arch_type: "seq2seq" tokenizer: tokenizer_path: "/root/autodl-tmp/Data/models/glm-350M-chinese" padding_side: "right" optimizer: name: "adamw" kwargs: lr: 5.0e-6 betas: [0.9, 0.999] eps: 1.0e-8 weight_decay: 0.01 scheduler: name: "cosine_annealing" kwargs: T_max: 100000 eta_min: 5.0e-6 method: name: "ppoconfig" num_rollouts: 512 chunk_size: 1 ppo_epochs: 1 init_kl_coef: 0.1 target: 6 horizon: 10000 gamma: 1 lam: 0.95 cliprange: 0.2 cliprange_value: 0.2 vf_coef: 0.2 scale_reward: False ref_mean: null ref_std: null cliprange_reward: 10 gen_kwargs: max_new_tokens: 100 top_k: 50 top_p: 0.8 do_sample: True temperature: 1.0 ================================================ FILE: src/resources/config/ppo_model/ppo_config_glm-small.yml ================================================ train: seq_length: 256 epochs: 1 total_steps: 10000 batch_size: 2 checkpoint_interval: 100 eval_interval: 100 pipeline: "GLMPipeline" trainer: "AcceleratePPOTrainer" tracker: "tensorboard" logging_dir: "tensorboard" model: model_path: "/root/autodl-tmp/Data/models/glm-350M-chinese" num_layers_unfrozen: -1 model_arch_type: "seq2seq" tokenizer: tokenizer_path: "/root/autodl-tmp/Data/models/glm-350M-chinese" padding_side: "left" optimizer: name: "adamw" kwargs: lr: 5.0e-6 betas: [0.9, 0.999] eps: 1.0e-8 weight_decay: 0.01 scheduler: name: "cosine_annealing" kwargs: T_max: 100000 eta_min: 5.0e-6 method: name: "ppoconfig" num_rollouts: 2 chunk_size: 1 ppo_epochs: 1 init_kl_coef: 0.1 target: 6 horizon: 10000 gamma: 1 lam: 0.95 cliprange: 0.2 cliprange_value: 0.2 vf_coef: 0.2 scale_reward: False ref_mean: null ref_std: null cliprange_reward: 10 gen_kwargs: max_new_tokens: 5 top_k: 50 top_p: 0.8 do_sample: True temperature: 1.0 
================================================ FILE: src/resources/config/ppo_model/ppo_config_pangu-2.6B.yml ================================================ train: seq_length: 512 epochs: 1 total_steps: 50000 batch_size: 4 checkpoint_interval: 1000 eval_interval: 1000 pipeline: "PanguPipeline" trainer: "AcceleratePPOTrainer" tracker: "tensorboard" logging_dir: "tensorboard" model: model_path: "/root/autodl-tmp/Data/models/pangu-2.6B" num_layers_unfrozen: -1 tokenizer: tokenizer_path: "/root/autodl-tmp/Data/models/pangu-2.6B" padding_side: "left" optimizer: name: "adamw" kwargs: lr: 5.0e-6 betas: [0.9, 0.999] eps: 1.0e-8 weight_decay: 0.01 scheduler: name: "cosine_annealing" kwargs: T_max: 100000 eta_min: 5.0e-6 method: name: "ppoconfig" num_rollouts: 512 chunk_size: 4 ppo_epochs: 1 init_kl_coef: 0.1 target: 6 horizon: 10000 gamma: 1 lam: 0.95 cliprange: 0.2 cliprange_value: 0.2 vf_coef: 0.2 scale_reward: False ref_mean: null ref_std: null cliprange_reward: 10 gen_kwargs: max_new_tokens: 100 top_k: 50 top_p: 0.8 do_sample: True temperature: 1.0 ================================================ FILE: src/resources/config/ppo_model/ppo_config_pangu-350M.yml ================================================ train: seq_length: 512 epochs: 1 total_steps: 100000 batch_size: 16 checkpoint_interval: 1000 eval_interval: 1000 pipeline: "PanguPipeline" trainer: "AcceleratePPOTrainer" tracker: "tensorboard" logging_dir: "tensorboard" model: model_path: "/root/autodl-tmp/Data/models/pangu-350M" num_layers_unfrozen: -1 tokenizer: tokenizer_path: "/root/autodl-tmp/Data/models/pangu-350M" padding_side: "left" optimizer: name: "adamw" kwargs: lr: 5.0e-6 betas: [0.9, 0.999] eps: 1.0e-8 weight_decay: 0.01 scheduler: name: "cosine_annealing" kwargs: T_max: 100000 eta_min: 5.0e-6 method: name: "ppoconfig" num_rollouts: 512 chunk_size: 16 ppo_epochs: 1 init_kl_coef: 0.1 target: 6 horizon: 10000 gamma: 1 lam: 0.95 cliprange: 0.2 cliprange_value: 0.2 vf_coef: 0.2 scale_reward: False 
ref_mean: null ref_std: null cliprange_reward: 10 gen_kwargs: max_new_tokens: 100 top_k: 50 top_p: 0.8 do_sample: True temperature: 1.0 ================================================ FILE: src/resources/dict/jieba_custom_dict.txt ================================================ ================================================ FILE: src/resources/eval/ceval/subject_mapping.json ================================================ { "computer_network": [ "Computer Network", "计算机网络", "STEM" ], "operating_system": [ "Operating System", "操作系统", "STEM" ], "computer_architecture": [ "Computer Architecture", "计算机组成", "STEM" ], "college_programming": [ "College Programming", "大学编程", "STEM" ], "college_physics": [ "College Physics", "大学物理", "STEM" ], "college_chemistry": [ "College Chemistry", "大学化学", "STEM" ], "advanced_mathematics": [ "Advanced Mathematics", "高等数学", "STEM" ], "probability_and_statistics": [ "Probability and Statistics", "概率统计", "STEM" ], "discrete_mathematics": [ "Discrete Mathematics", "离散数学", "STEM" ], "electrical_engineer": [ "Electrical Engineer", "注册电气工程师", "STEM" ], "metrology_engineer": [ "Metrology Engineer", "注册计量师", "STEM" ], "high_school_mathematics": [ "High School Mathematics", "高中数学", "STEM" ], "high_school_physics": [ "High School Physics", "高中物理", "STEM" ], "high_school_chemistry": [ "High School Chemistry", "高中化学", "STEM" ], "high_school_biology": [ "High School Biology", "高中生物", "STEM" ], "middle_school_mathematics": [ "Middle School Mathematics", "初中数学", "STEM" ], "middle_school_biology": [ "Middle School Biology", "初中生物", "STEM" ], "middle_school_physics": [ "Middle School Physics", "初中物理", "STEM" ], "middle_school_chemistry": [ "Middle School Chemistry", "初中化学", "STEM" ], "veterinary_medicine": [ "Veterinary Medicine", "兽医学", "STEM" ], "college_economics": [ "College Economics", "大学经济学", "Social Science" ], "business_administration": [ "Business Administration", "工商管理", "Social Science" ], "marxism": [ "Marxism", "马克思主义基本原理", "Social 
Science" ], "mao_zedong_thought": [ "Mao Zedong Thought", "毛泽东思想和中国特色社会主义理论体系概论", "Social Science" ], "education_science": [ "Education Science", "教育学", "Social Science" ], "teacher_qualification": [ "Teacher Qualification", "教师资格", "Social Science" ], "high_school_politics": [ "High School Politics", "高中政治", "Social Science" ], "high_school_geography": [ "High School Geography", "高中地理", "Social Science" ], "middle_school_politics": [ "Middle School Politics", "初中政治", "Social Science" ], "middle_school_geography": [ "Middle School Geography", "初中地理", "Social Science" ], "modern_chinese_history": [ "Modern Chinese History", "近代史纲要", "Humanities" ], "ideological_and_moral_cultivation": [ "Ideological and Moral Cultivation", "思想道德修养与法律基础", "Humanities" ], "logic": [ "Logic", "逻辑学", "Humanities" ], "law": [ "Law", "法学", "Humanities" ], "chinese_language_and_literature": [ "Chinese Language and Literature", "中国语言文学", "Humanities" ], "art_studies": [ "Art Studies", "艺术学", "Humanities" ], "professional_tour_guide": [ "Professional Tour Guide", "导游资格", "Humanities" ], "legal_professional": [ "Legal Professional", "法律职业资格", "Humanities" ], "high_school_chinese": [ "High School Chinese", "高中语文", "Humanities" ], "high_school_history": [ "High School History", "高中历史", "Humanities" ], "middle_school_history": [ "Middle School History", "初中历史", "Humanities" ], "civil_servant": [ "Civil Servant", "公务员", "Other" ], "sports_science": [ "Sports Science", "体育学", "Other" ], "plant_protection": [ "Plant Protection", "植物保护", "Other" ], "basic_medicine": [ "Basic Medicine", "基础医学", "Other" ], "clinical_medicine": [ "Clinical Medicine", "临床医学", "Other" ], "urban_and_rural_planner": [ "Urban and Rural Planner", "注册城乡规划师", "Other" ], "accountant": [ "Accountant", "注册会计师", "Other" ], "fire_engineer": [ "Fire Engineer", "注册消防工程师", "Other" ], "environmental_impact_assessment_engineer": [ "Environmental Impact Assessment Engineer", "环境影响评价工程师", "Other" ], "tax_accountant": [ "Tax Accountant", 
"税务师", "Other" ], "physician": [ "Physician", "医师资格", "Other" ] } ================================================ FILE: src/resources/eval/ceval/submission_example.json ================================================ { "chinese_language_and_literature": { "0": "A", "1": "B", "2": "B" }, "clinical_medicine": { "0": "C", "1": "D", "2": "C" }, "sports_science": { "0": "D", "1": "C", "2": "B" }, "civil_servant": { "0": "A", "1": "B", "2": "D" }, "veterinary_medicine": { "0": "D", "1": "B", "2": "C" }, "middle_school_chemistry": { "0": "A", "1": "A", "2": "C" }, "middle_school_history": { "0": "C", "1": "D", "2": "A" }, "middle_school_geography": { "0": "C", "1": "C", "2": "C" }, "middle_school_politics": { "0": "B", "1": "B", "2": "D" }, "middle_school_mathematics": { "0": "B", "1": "D", "2": "C" }, "middle_school_physics": { "0": "A", "1": "A", "2": "D" }, "middle_school_biology": { "0": "C", "1": "A", "2": "A" }, "physician": { "0": "B", "1": "D", "2": "D" }, "basic_medicine": { "0": "D", "1": "B", "2": "D" }, "modern_chinese_history": { "0": "C", "1": "C", "2": "D" }, "college_chemistry": { "0": "B", "1": "A", "2": "C" }, "college_physics": { "0": "C", "1": "C", "2": "A" }, "college_economics": { "0": "A", "1": "A", "2": "B" }, "college_programming": { "0": "D", "1": "D", "2": "D" }, "professional_tour_guide": { "0": "B", "1": "B", "2": "A" }, "business_administration": { "0": "C", "1": "B", "2": "A" }, "ideological_and_moral_cultivation": { "0": "A", "1": "C", "2": "A" }, "operating_system": { "0": "C", "1": "A", "2": "D" }, "teacher_qualification": { "0": "C", "1": "C", "2": "B" }, "education_science": { "0": "D", "1": "D", "2": "B" }, "plant_protection": { "0": "B", "1": "D", "2": "B" }, "probability_and_statistics": { "0": "C", "1": "C", "2": "A" }, "mao_zedong_thought": { "0": "B", "1": "B", "2": "B" }, "law": { "0": "D", "1": "B", "2": "B" }, "legal_professional": { "0": "D", "1": "C", "2": "D" }, "accountant": { "0": "A", "1": "B", "2": "D" }, 
"urban_and_rural_planner": { "0": "B", "1": "C", "2": "A" }, "fire_engineer": { "0": "A", "1": "C", "2": "C" }, "electrical_engineer": { "0": "B", "1": "A", "2": "A" }, "metrology_engineer": { "0": "B", "1": "B", "2": "C" }, "environmental_impact_assessment_engineer": { "0": "D", "1": "B", "2": "A" }, "discrete_mathematics": { "0": "C", "1": "A", "2": "C" }, "tax_accountant": { "0": "A", "1": "C", "2": "C" }, "art_studies": { "0": "A", "1": "B", "2": "A" }, "computer_architecture": { "0": "A", "1": "C", "2": "A" }, "computer_network": { "0": "D", "1": "B", "2": "C" }, "logic": { "0": "B", "1": "A", "2": "B" }, "marxism": { "0": "B", "1": "B", "2": "B" }, "high_school_chemistry": { "0": "B", "1": "C", "2": "D" }, "high_school_history": { "0": "A", "1": "D", "2": "A" }, "high_school_geography": { "0": "B", "1": "A", "2": "D" }, "high_school_politics": { "0": "B", "1": "A", "2": "B" }, "high_school_mathematics": { "0": "D", "1": "D", "2": "A" }, "high_school_physics": { "0": "C", "1": "C", "2": "C" }, "high_school_biology": { "0": "B", "1": "A", "2": "B" }, "high_school_chinese": { "0": "D", "1": "B", "2": "A" }, "advanced_mathematics": { "0": "C", "1": "D", "2": "A" } } ================================================ FILE: src/resources/eval/mmlu/subject_mapping.json ================================================ { "moral_scenarios": [ "Moral Scenarios", "Humanities" ], "us_foreign_policy": [ "US Foreign Policy", "Social Sciences" ], "public_relations": [ "Public Relations", "Social Sciences" ], "global_facts": [ "Global Facts", "Other" ], "electrical_engineering": [ "Electrical Engineering", "STEM" ], "astronomy": [ "Astronomy", "STEM" ], "business_ethics": [ "Business Ethics", "Other" ], "jurisprudence": [ "Jurisprudence", "Humanities" ], "high_school_chemistry": [ "High School Chemistry", "STEM" ], "college_physics": [ "College Physics", "STEM" ], "professional_psychology": [ "Professional Psychology", "Social Sciences" ], "marketing": [ "Marketing", "Other" ], 
"management": [ "Management", "Other" ], "virology": [ "Virology", "Other" ], "international_law": [ "International Law", "Humanities" ], "high_school_macroeconomics": [ "High School Macroeconomics", "Social Sciences" ], "prehistory": [ "Prehistory", "Humanities" ], "abstract_algebra": [ "Abstract Algebra", "STEM" ], "high_school_physics": [ "High School Physics", "STEM" ], "formal_logic": [ "Formal Logic", "Humanities" ], "college_medicine": [ "College Medicine", "Other" ], "high_school_us_history": [ "High School US History", "Humanities" ], "moral_disputes": [ "Moral Disputes", "Humanities" ], "high_school_european_history": [ "High School European History", "Humanities" ], "clinical_knowledge": [ "Clinical Knowledge", "Other" ], "world_religions": [ "World Religions", "Humanities" ], "high_school_microeconomics": [ "High School Microeconomics", "Social Sciences" ], "professional_law": [ "Professional Law", "Humanities" ], "human_aging": [ "Human Aging", "Other" ], "medical_genetics": [ "Medical Genetics", "Other" ], "high_school_geography": [ "High School Geography", "Social Sciences" ], "high_school_government_and_politics": [ "High School Government and Politics", "Social Sciences" ], "anatomy": [ "Anatomy", "STEM" ], "sociology": [ "Sociology", "Social Sciences" ], "logical_fallacies": [ "Logical Fallacies", "Humanities" ], "high_school_computer_science": [ "High School Computer Science", "STEM" ], "miscellaneous": [ "Miscellaneous", "Other" ], "high_school_world_history": [ "High School World History", "Humanities" ], "professional_medicine": [ "Professional Medicine", "Other" ], "high_school_biology": [ "High School Biology", "STEM" ], "high_school_statistics": [ "High School Statistics", "STEM" ], "college_chemistry": [ "College Chemistry", "STEM" ], "nutrition": [ "Nutrition", "Other" ], "econometrics": [ "Econometrics", "Social Sciences" ], "human_sexuality": [ "Human Sexuality", "Social Sciences" ], "security_studies": [ "Security Studies", "Social 
Sciences" ], "philosophy": [ "Philosophy", "Humanities" ], "elementary_mathematics": [ "Elementary Mathematics", "STEM" ], "college_biology": [ "College Biology", "STEM" ], "college_computer_science": [ "College Computer Science", "STEM" ], "machine_learning": [ "Machine Learning", "STEM" ], "professional_accounting": [ "Professional Accounting", "Other" ], "college_mathematics": [ "College Mathematics", "STEM" ], "high_school_mathematics": [ "High School Mathematics", "STEM" ], "high_school_psychology": [ "High School Psychology", "Social Sciences" ], "conceptual_physics": [ "Conceptual Physics", "STEM" ], "computer_security": [ "Computer Security", "STEM" ] } ================================================ FILE: src/resources/models/baichuan-13B-base/config.json ================================================ { "_from_model_config": true, "architectures": [ "BaichuanForCausalLM" ], "auto_map": { "AutoConfig": "configuration_baichuan.BaichuanConfig", "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM" }, "bos_token_id": 1, "eos_token_id": 2, "gradient_checkpointing": false, "hidden_act": "silu", "hidden_size": 5120, "initializer_range": 0.02, "intermediate_size": 13696, "model_max_length": 4096, "model_type": "baichuan", "num_attention_heads": 40, "num_hidden_layers": 40, "pad_token_id": 0, "rms_norm_eps": 1e-06, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.29.2", "use_cache": true, "vocab_size": 64000 } ================================================ FILE: src/resources/models/baichuan-13B-base/configuration_baichuan.py ================================================ # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. 
from transformers.configuration_utils import PretrainedConfig


class BaichuanConfig(PretrainedConfig):
    """Configuration class for Baichuan models (13B-base layout).

    Field defaults mirror the shipped ``config.json`` of baichuan-13B-base
    (hidden_size 5120, 40 layers / 40 heads, vocab 64000).
    """

    model_type = "baichuan"
    # past_key_values is runtime decoding state, never serialized config.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=64000,
        hidden_size=5120,
        intermediate_size=13696,
        num_hidden_layers=40,
        num_attention_heads=40,
        hidden_act="silu",
        model_max_length=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        gradient_checkpointing=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.model_max_length = model_max_length
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # BUG FIX: the original assignment ended with a stray comma, storing the
        # 1-tuple `(gradient_checkpointing,)` instead of the boolean.  A tuple is
        # always truthy, so BaichuanModel's
        # `if self.gradient_checkpointing and self.training:` would enable
        # checkpointing even when False was passed.
        self.gradient_checkpointing = gradient_checkpointing
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
import math from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch.nn import CrossEntropyLoss from transformers import PreTrainedModel from transformers.activations import ACT2FN from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.utils import logging from transformers.generation.utils import GenerationConfig from .configuration_baichuan import BaichuanConfig logger = logging.get_logger(__name__) def _get_interleave(n): def _get_interleave_power_of_2(n): start = (2 ** (-2 ** -(math.log2(n) - 3))) ratio = start return [start * ratio ** i for i in range(n)] if math.log2(n).is_integer(): return _get_interleave_power_of_2(n) else: closest_power_of_2 = 2 ** math.floor(math.log2(n)) return _get_interleave_power_of_2(closest_power_of_2) + \ _get_interleave(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] def _fill_with_neg_inf(t): """FP16-compatible function that fills a tensor with -inf.""" return t.float().fill_(float("-inf")).type_as(t) def _gen_alibi_mask(n_head, max_pos, alibi_mask=None): slopes = torch.Tensor(_get_interleave(n_head)) alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_pos).unsqueeze(0).unsqueeze(0).expand( n_head, -1, -1) alibi = alibi.view(n_head, 1, max_pos) if alibi_mask is None: alibi_mask = torch.triu( _fill_with_neg_inf(torch.zeros([max_pos, max_pos])), 1 ) alibi_mask = alibi_mask.unsqueeze(0) + alibi return alibi_mask class RMSNorm(torch.nn.Module): def __init__(self, hidden_size, epsilon=1e-6): super().__init__() self.weight = torch.nn.Parameter(torch.empty(hidden_size)) self.epsilon = epsilon def forward(self, hidden_states): variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon) # convert into half-precision if self.weight.dtype in [torch.float16, torch.bfloat16]: hidden_states = hidden_states.to(self.weight.dtype) return self.weight * 
hidden_states class MLP(torch.nn.Module): def __init__( self, hidden_size: int, intermediate_size: int, hidden_act: str, ): super().__init__() self.gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False) self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False) self.up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False) self.act_fn = ACT2FN[hidden_act] def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) class BaichuanAttention(torch.nn.Module): def __init__(self, config: BaichuanConfig): super().__init__() self.config = config self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads self.max_position_embeddings = config.model_max_length if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( f"hidden_size {self.hidden_size} is not divisible by num_heads {self.num_heads}" ) self.W_pack = torch.nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) self.o_proj = torch.nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() proj = self.W_pack(hidden_states) proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = proj[2].view(bsz, q_len, self.num_heads, 
class BaichuanLayer(torch.nn.Module):
    """One pre-norm transformer block: ALiBi self-attention followed by the
    gated MLP, each wrapped in a residual connection."""

    def __init__(self, config: BaichuanConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = BaichuanAttention(config=config)
        self.mlp = MLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        # Attention sub-block (pre-norm + residual).
        shortcut = hidden_states
        normed = self.input_layernorm(hidden_states)
        attn_out, _attn_weights, present_key_value = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = shortcut + attn_out

        # Feed-forward sub-block (pre-norm + residual).
        shortcut = hidden_states
        hidden_states = shortcut + self.mlp(self.post_attention_layernorm(hidden_states))

        # NOTE: attention weights are never appended to the output tuple here,
        # even when output_attentions is True — only the kv cache is.
        outputs = (hidden_states,)
        if use_cache:
            outputs += (present_key_value,)
        return outputs


class BaichuanPreTrainedModel(PreTrainedModel):
    """Shared weight-initialization / gradient-checkpointing plumbing for
    Baichuan models."""

    config_class = BaichuanConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["BaichuanLayer"]
    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

    def _init_weights(self, module):
        # Normal(0, initializer_range) for linear and embedding weights;
        # zero any linear bias and zero the padding embedding row.
        std = self.config.initializer_range
        if isinstance(module, torch.nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, torch.nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, BaichuanModel):
            module.gradient_checkpointing = value
    def set_input_embeddings(self, value):
        # Standard HF hook: swap in a replacement token-embedding module.
        self.embed_tokens = value

    def get_alibi_mask(self, tensor, seq_length_with_past, attention_mask):
        """Return the additive ALiBi attention bias, sliced to the current length.

        The full (n_head, max_cache_pos, max_cache_pos) bias is built lazily on
        first use and cached as the non-persistent buffer `future_mask`, placed
        on the same device/dtype as `tensor` via `.to(tensor)`.
        """
        if self.first_run:
            # Build the cache once; `persistent=False` keeps it out of state_dict.
            self.first_run = False
            self.register_buffer(
                "future_mask",
                _gen_alibi_mask(self.n_head, self.max_cache_pos, attention_mask).to(tensor),
                persistent=False
            )
        if seq_length_with_past > self.max_cache_pos:
            # The sequence outgrew the cached bias: rebuild a larger one.
            self.max_cache_pos = seq_length_with_past
            self.register_buffer(
                "future_mask",
                _gen_alibi_mask(self.n_head, self.max_cache_pos, attention_mask).to(tensor),
                persistent=False
            )
        # NOTE(review): `attention_mask` is forwarded to _gen_alibi_mask as its
        # `alibi_mask` argument; when it is not None the causal -inf triangle is
        # NOT built and the caller-provided tensor becomes the additive base.
        # Presumably callers pass None (or a pre-built causal mask) — a plain
        # 2-D padding mask would broadcast incorrectly here; verify at call sites.
        mask = self.future_mask[:self.n_head, :seq_length_with_past, :seq_length_with_past]
        return mask
) use_cache = False # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None next_decoder_cache = () if use_cache else None for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value return module(*inputs, output_attentions, None) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, None, ) else: layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, ) hidden_states = layer_outputs[0] if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) if output_attentions: all_self_attns += (layer_outputs[1],) hidden_states = self.norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns, ) class BaichuanForCausalLM(BaichuanPreTrainedModel): def __init__(self, config): super().__init__(config) self.model = BaichuanModel(config) self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing self.post_init() def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[List[torch.FloatTensor]] 
= None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, **kwargs ) -> Union[Tuple, CausalLMOutputWithPast]: # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = outputs[0] logits = self.lm_head(hidden_states) loss = None if labels is not None: # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() shift_logits = shift_logits.view(-1, self.config.vocab_size) shift_labels = shift_labels.view(-1) # Enable model parallelism shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output return CausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): if past_key_values: input_ids = input_ids[:, -1:] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} else: model_inputs = {"input_ids": input_ids} model_inputs.update( { "past_key_values": past_key_values, "use_cache": kwargs.get("use_cache"), } ) return model_inputs @staticmethod def 
_reorder_cache(past_key_values, beam_idx): return tuple( tuple(past_state.index_select(0, beam_idx) for past_state in layer_past) for layer_past in past_key_values ) def quantize(self, bits: int): try: from .quantizer import QLinear except ImportError: raise ImportError( f"Needs QLinear to run quantize." ) for layer in self.model.layers: layer.self_attn.W_pack = QLinear( bits=bits, weight=layer.self_attn.W_pack.weight, bias = None, ) layer.self_attn.o_proj = QLinear( bits=bits, weight=layer.self_attn.o_proj.weight, bias = None, ) layer.mlp.gate_proj = QLinear( bits=bits, weight=layer.mlp.gate_proj.weight, bias = None, ) layer.mlp.down_proj = QLinear( bits=bits, weight=layer.mlp.down_proj.weight, bias = None, ) layer.mlp.up_proj = QLinear( bits=bits, weight=layer.mlp.up_proj.weight, bias = None, ) return self def _build_chat_input(self, tokenizer, messages: List[dict], max_new_tokens: int=0): max_new_tokens = max_new_tokens or self.generation_config.max_new_tokens max_input_tokens = self.config.model_max_length - max_new_tokens max_input_tokens = max(self.config.model_max_length // 2, max_input_tokens) total_input, round_input = [], [] for i, message in enumerate(messages[::-1]): content_tokens = tokenizer.encode(message['content']) if message['role'] == 'user': round_input = [self.generation_config.user_token_id] + content_tokens + round_input if total_input and len(total_input) + len(round_input) > max_input_tokens: break else: total_input = round_input + total_input if len(total_input) >= max_input_tokens: break else: round_input = [] elif message['role'] == 'assistant': round_input = [ self.generation_config.assistant_token_id ] + content_tokens + [ self.generation_config.eos_token_id ] + round_input else: raise ValueError(f"message role not supported yet: {message['role']}") total_input = total_input[-max_input_tokens:] # truncate left total_input.append(self.generation_config.assistant_token_id) total_input = torch.LongTensor([total_input]).to(self.device) 
    @torch.no_grad()
    def chat(self, tokenizer, messages: List[dict], stream=False,
             generation_config: Optional[GenerationConfig]=None):
        """Generate an assistant reply for a list of chat `messages`.

        Each message is a dict with 'role' ('user'/'assistant') and 'content'.
        When `stream=True`, returns a generator yielding the cumulative decoded
        text after each new token; otherwise returns the final reply string.
        """
        generation_config = generation_config or self.generation_config
        input_ids = self._build_chat_input(tokenizer, messages, generation_config.max_new_tokens)
        if stream:
            from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
            # NOTE(review): this monkey-patches `generate`/`sample_stream` on the
            # *class*, so it affects every live instance of this model class.
            self.__class__.generate = NewGenerationMixin.generate
            self.__class__.sample_stream = NewGenerationMixin.sample_stream
            stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True)

            def stream_generator():
                outputs = []
                # Yield the full decoded prefix after every generated token.
                for token in self.generate(input_ids, generation_config=stream_config):
                    outputs.append(token.item())
                    yield tokenizer.decode(outputs, skip_special_tokens=True)

            return stream_generator()
        else:
            # Restore the stock HF generate in case a streaming call patched it.
            self.__class__.generate = PreTrainedModel.generate  # disable stream
            outputs = self.generate(input_ids, generation_config=generation_config)
            # Decode only the newly generated tail, skipping the prompt tokens.
            response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
            return response
""" vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] def __init__( self, vocab_file, unk_token="", bos_token="", eos_token="", pad_token=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, add_bos_token=True, add_eos_token=False, clean_up_tokenization_spaces=False, **kwargs, ): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, add_bos_token=add_bos_token, add_eos_token=add_eos_token, sp_model_kwargs=self.sp_model_kwargs, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) @property def vocab_size(self): """Returns vocab size""" return self.sp_model.get_piece_size() def get_vocab(self): """Returns vocab as a dict""" vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text): """Returns a tokenized string.""" return 
self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" token = self.sp_model.IdToPiece(index) return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" current_sub_tokens = [] out_string = "" prev_is_special = False for i, token in enumerate(tokens): # make sure that special tokens are not decoded using sentencepiece model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " out_string += self.sp_model.decode(current_sub_tokens) + token prev_is_special = True current_sub_tokens = [] else: current_sub_tokens.append(token) prev_is_special = False out_string += self.sp_model.decode(current_sub_tokens) return out_string def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. Args: save_directory (`str`): The directory in which to save the vocabulary. Returns: `Tuple(str)`: Paths to the files saved. 
""" if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) return (out_vocab_file,) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] output = bos_token_id + token_ids_0 + eos_token_id if token_ids_1 is not None: output = output + bos_token_id + token_ids_1 + eos_token_id return output def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) bos_token_id = [1] if self.add_bos_token else [] eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id return ( bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id ) def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | ``` if token_ids_1 is None, only returns the first portion of the mask (0s). Args: token_ids_0 (`List[int]`): List of ids. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
""" bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) if token_ids_1 is not None: output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) return output ================================================ FILE: src/resources/models/baichuan-13B-base/tokenizer_config.json ================================================ { "add_bos_token": true, "add_eos_token": false, "auto_map": { "AutoTokenizer": [ "tokenization_baichuan.BaichuanTokenizer", null ] }, "bos_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true }, "clean_up_tokenization_spaces": false, "eos_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true }, "model_max_length": 4096, "pad_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true }, "sp_model_kwargs": {}, "tokenizer_class": "BaichuanTokenizer", "unk_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true } } ================================================ FILE: src/resources/models/baichuan-13B-chat/config.json ================================================ { "_from_model_config": true, "architectures": [ "BaichuanForCausalLM" ], "auto_map": { "AutoConfig": "configuration_baichuan.BaichuanConfig", "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM" }, "bos_token_id": 1, "eos_token_id": 2, "hidden_act": "silu", "hidden_size": 5120, "initializer_range": 0.02, "intermediate_size": 13696, "model_max_length": 4096, "model_type": "baichuan", "num_attention_heads": 40, "num_hidden_layers": 40, "pad_token_id": 0, "rms_norm_eps": 1e-06, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.29.2", 
"use_cache": true, "vocab_size": 64000 } ================================================ FILE: src/resources/models/baichuan-13B-chat/configuration_baichuan.py ================================================ # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. from transformers.configuration_utils import PretrainedConfig class BaichuanConfig(PretrainedConfig): model_type = "baichuan" keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, vocab_size=64000, hidden_size=5120, intermediate_size=13696, num_hidden_layers=40, num_attention_heads=40, hidden_act="silu", model_max_length=4096, initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, pad_token_id=0, bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, gradient_checkpointing=False, **kwargs, ): self.vocab_size = vocab_size self.model_max_length = model_max_length self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.gradient_checkpointing = gradient_checkpointing, super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs, ) ================================================ FILE: src/resources/models/baichuan-13B-chat/modeling_baichuan.py ================================================ # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. 
import math from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch.nn import CrossEntropyLoss from transformers import PreTrainedModel from transformers.activations import ACT2FN from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.utils import logging from transformers.generation.utils import GenerationConfig from .configuration_baichuan import BaichuanConfig logger = logging.get_logger(__name__) def _get_interleave(n): def _get_interleave_power_of_2(n): start = (2 ** (-2 ** -(math.log2(n) - 3))) ratio = start return [start * ratio ** i for i in range(n)] if math.log2(n).is_integer(): return _get_interleave_power_of_2(n) else: closest_power_of_2 = 2 ** math.floor(math.log2(n)) return _get_interleave_power_of_2(closest_power_of_2) + \ _get_interleave(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] def _fill_with_neg_inf(t): """FP16-compatible function that fills a tensor with -inf.""" return t.float().fill_(float("-inf")).type_as(t) def _gen_alibi_mask(n_head, max_pos, alibi_mask=None): slopes = torch.Tensor(_get_interleave(n_head)) alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_pos).unsqueeze(0).unsqueeze(0).expand( n_head, -1, -1) alibi = alibi.view(n_head, 1, max_pos) if alibi_mask is None: alibi_mask = torch.triu( _fill_with_neg_inf(torch.zeros([max_pos, max_pos])), 1 ) alibi_mask = alibi_mask.unsqueeze(0) + alibi return alibi_mask class RMSNorm(torch.nn.Module): def __init__(self, hidden_size, epsilon=1e-6): super().__init__() self.weight = torch.nn.Parameter(torch.empty(hidden_size)) self.epsilon = epsilon def forward(self, hidden_states): variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon) # convert into half-precision if self.weight.dtype in [torch.float16, torch.bfloat16]: hidden_states = hidden_states.to(self.weight.dtype) return self.weight * 
hidden_states class MLP(torch.nn.Module): def __init__( self, hidden_size: int, intermediate_size: int, hidden_act: str, ): super().__init__() self.gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False) self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False) self.up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False) self.act_fn = ACT2FN[hidden_act] def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) class BaichuanAttention(torch.nn.Module): def __init__(self, config: BaichuanConfig): super().__init__() self.config = config self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads self.max_position_embeddings = config.model_max_length if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( f"hidden_size {self.hidden_size} is not divisible by num_heads {self.num_heads}" ) self.W_pack = torch.nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) self.o_proj = torch.nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() proj = self.W_pack(hidden_states) proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = proj[2].view(bsz, q_len, self.num_heads, 
self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] if past_key_value is not None: # reuse k, v, self_attention key_states = torch.cat([past_key_value[0], key_states], dim=2) value_states = torch.cat([past_key_value[1], value_states], dim=2) past_key_value = (key_states, value_states) if use_cache else None attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: if attn_weights.size(-2) == 1: attention_mask = attention_mask[:, -1:, :] attn_weights = attn_weights + attention_mask.unsqueeze(0) attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None return attn_output, attn_weights, past_key_value class BaichuanLayer(torch.nn.Module): def __init__(self, config: BaichuanConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = BaichuanAttention(config=config) self.mlp = MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, ) self.input_layernorm = RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) # Self Attention 
hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, ) hidden_states = residual + hidden_states # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states outputs = (hidden_states,) if use_cache: outputs += (present_key_value,) return outputs class BaichuanPreTrainedModel(PreTrainedModel): config_class = BaichuanConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["BaichuanLayer"] _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range if isinstance(module, torch.nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, torch.nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, BaichuanModel): module.gradient_checkpointing = value class BaichuanModel(BaichuanPreTrainedModel): def __init__(self, config: BaichuanConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.n_head = config.num_attention_heads self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) self.layers = torch.nn.ModuleList([BaichuanLayer(config) for _ in range(config.num_hidden_layers)]) self.norm = RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps) self.gradient_checkpointing = config.gradient_checkpointing self.post_init() self.max_cache_pos = config.model_max_length self.first_run = True def get_input_embeddings(self): return self.embed_tokens def 
set_input_embeddings(self, value): self.embed_tokens = value def get_alibi_mask(self, tensor, seq_length_with_past, attention_mask): if self.first_run: self.first_run = False self.register_buffer( "future_mask", _gen_alibi_mask(self.n_head, self.max_cache_pos, attention_mask).to(tensor), persistent=False ) if seq_length_with_past > self.max_cache_pos: self.max_cache_pos = seq_length_with_past self.register_buffer( "future_mask", _gen_alibi_mask(self.n_head, self.max_cache_pos, attention_mask).to(tensor), persistent=False ) mask = self.future_mask[:self.n_head, :seq_length_with_past, :seq_length_with_past] return mask def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, ) -> Union[Tuple, BaseModelOutputWithPast]: if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot provide both input_ids and inputs_embeds simultaneously") elif input_ids is not None: batch_size, seq_length = input_ids.shape elif inputs_embeds is not None: batch_size, seq_length, _ = inputs_embeds.shape else: raise ValueError("You need to provide input_ids or inputs_embeds") seq_length_with_past = seq_length if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] seq_length_with_past = seq_length_with_past + past_key_values_length if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) # embed positions attention_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past, attention_mask) hidden_states = inputs_embeds if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
) use_cache = False # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None next_decoder_cache = () if use_cache else None for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value return module(*inputs, output_attentions, None) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, None, ) else: layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, ) hidden_states = layer_outputs[0] if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) if output_attentions: all_self_attns += (layer_outputs[1],) hidden_states = self.norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns, ) class BaichuanForCausalLM(BaichuanPreTrainedModel): def __init__(self, config): super().__init__(config) self.model = BaichuanModel(config) self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing self.post_init() def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[List[torch.FloatTensor]] 
= None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, **kwargs ) -> Union[Tuple, CausalLMOutputWithPast]: # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = outputs[0] logits = self.lm_head(hidden_states) loss = None if labels is not None: # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() shift_logits = shift_logits.view(-1, self.config.vocab_size) shift_labels = shift_labels.view(-1) # Enable model parallelism shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output return CausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): if past_key_values: input_ids = input_ids[:, -1:] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} else: model_inputs = {"input_ids": input_ids} model_inputs.update( { "past_key_values": past_key_values, "use_cache": kwargs.get("use_cache"), } ) return model_inputs @staticmethod def 
_reorder_cache(past_key_values, beam_idx):
        # Reorder every layer's cached (key, value) tensors along the batch axis
        # so the KV cache follows beam-search hypothesis reordering.
        return tuple(
            tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
            for layer_past in past_key_values
        )

    def quantize(self, bits: int):
        """Quantize every linear projection of every layer to ``bits`` bits in place.

        Requires the optional sibling module ``.quantizer`` (QLinear); returns
        ``self`` so the call can be chained after ``from_pretrained``.
        """
        try:
            from .quantizer import QLinear
        except ImportError:
            raise ImportError(
                f"Needs QLinear to run quantize."
            )

        for layer in self.model.layers:
            # Replace the packed QKV projection, output projection, and all
            # three MLP projections with quantized equivalents.
            layer.self_attn.W_pack = QLinear(
                bits=bits,
                weight=layer.self_attn.W_pack.weight,
                bias = None,
            )
            layer.self_attn.o_proj = QLinear(
                bits=bits,
                weight=layer.self_attn.o_proj.weight,
                bias = None,
            )
            layer.mlp.gate_proj = QLinear(
                bits=bits,
                weight=layer.mlp.gate_proj.weight,
                bias = None,
            )
            layer.mlp.down_proj = QLinear(
                bits=bits,
                weight=layer.mlp.down_proj.weight,
                bias = None,
            )
            layer.mlp.up_proj = QLinear(
                bits=bits,
                weight=layer.mlp.up_proj.weight,
                bias = None,
            )
        return self

    def _build_chat_input(self, tokenizer, messages: List[dict], max_new_tokens: int=0):
        """Encode chat ``messages`` into a prompt tensor for generation.

        Walks the history newest-to-oldest, keeping whole user/assistant rounds
        until the token budget is exhausted, then appends the assistant marker
        so generation continues as the assistant.
        """
        max_new_tokens = max_new_tokens or self.generation_config.max_new_tokens
        # Reserve max_new_tokens for generation, but never give the prompt less
        # than half of model_max_length.
        max_input_tokens = self.config.model_max_length - max_new_tokens
        max_input_tokens = max(self.config.model_max_length // 2, max_input_tokens)
        total_input, round_input = [], []
        for i, message in enumerate(messages[::-1]):
            content_tokens = tokenizer.encode(message['content'])
            if message['role'] == 'user':
                # A user message closes a round (we iterate in reverse): commit
                # the accumulated round if it still fits, else stop.
                round_input = [self.generation_config.user_token_id] + content_tokens + round_input
                if total_input and len(total_input) + len(round_input) > max_input_tokens:
                    break
                else:
                    total_input = round_input + total_input
                    if len(total_input) >= max_input_tokens:
                        break
                    else:
                        round_input = []
            elif message['role'] == 'assistant':
                # Assistant turns are buffered into the current round, wrapped
                # with the assistant marker and EOS.
                round_input = [
                    self.generation_config.assistant_token_id
                ] + content_tokens + [
                    self.generation_config.eos_token_id
                ] + round_input
            else:
                raise ValueError(f"message role not supported yet: {message['role']}")
        total_input = total_input[-max_input_tokens:]  # truncate left
        total_input.append(self.generation_config.assistant_token_id)
        # Shape (1, seq_len) on the model's device, ready for generate().
        total_input = torch.LongTensor([total_input]).to(self.device)
return total_input @torch.no_grad() def chat(self, tokenizer, messages: List[dict], stream=False, generation_config: Optional[GenerationConfig]=None): generation_config = generation_config or self.generation_config input_ids = self._build_chat_input(tokenizer, messages, generation_config.max_new_tokens) if stream: from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig self.__class__.generate = NewGenerationMixin.generate self.__class__.sample_stream = NewGenerationMixin.sample_stream stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True) def stream_generator(): outputs = [] for token in self.generate(input_ids, generation_config=stream_config): outputs.append(token.item()) yield tokenizer.decode(outputs, skip_special_tokens=True) return stream_generator() else: self.__class__.generate = PreTrainedModel.generate # disable stream outputs = self.generate(input_ids, generation_config=generation_config) response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True) return response ================================================ FILE: src/resources/models/baichuan-13B-chat/tokenization_baichuan.py ================================================ # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. import os from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.utils import logging logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": {}, "tokenizer_file": {}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} class BaichuanTokenizer(PreTrainedTokenizer): """ Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding. Args: vocab_file (`str`): Path to the vocabulary file. 
""" vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] def __init__( self, vocab_file, unk_token="", bos_token="", eos_token="", pad_token=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, add_bos_token=True, add_eos_token=False, clean_up_tokenization_spaces=False, **kwargs, ): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, add_bos_token=add_bos_token, add_eos_token=add_eos_token, sp_model_kwargs=self.sp_model_kwargs, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) @property def vocab_size(self): """Returns vocab size""" return self.sp_model.get_piece_size() def get_vocab(self): """Returns vocab as a dict""" vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text): """Returns a tokenized string.""" return 
self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" token = self.sp_model.IdToPiece(index) return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" current_sub_tokens = [] out_string = "" prev_is_special = False for i, token in enumerate(tokens): # make sure that special tokens are not decoded using sentencepiece model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " out_string += self.sp_model.decode(current_sub_tokens) + token prev_is_special = True current_sub_tokens = [] else: current_sub_tokens.append(token) prev_is_special = False out_string += self.sp_model.decode(current_sub_tokens) return out_string def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. Args: save_directory (`str`): The directory in which to save the vocabulary. Returns: `Tuple(str)`: Paths to the files saved. 
""" if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) return (out_vocab_file,) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] output = bos_token_id + token_ids_0 + eos_token_id if token_ids_1 is not None: output = output + bos_token_id + token_ids_1 + eos_token_id return output def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) bos_token_id = [1] if self.add_bos_token else [] eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id return ( bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id ) def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | ``` if token_ids_1 is None, only returns the first portion of the mask (0s). Args: token_ids_0 (`List[int]`): List of ids. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
""" bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) if token_ids_1 is not None: output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) return output ================================================ FILE: src/resources/models/baichuan-13B-chat/tokenizer_config.json ================================================ { "add_bos_token": true, "add_eos_token": false, "auto_map": { "AutoTokenizer": [ "tokenization_baichuan.BaichuanTokenizer", null ] }, "bos_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true }, "clean_up_tokenization_spaces": false, "eos_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true }, "model_max_length": 4096, "pad_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true }, "sp_model_kwargs": {}, "tokenizer_class": "BaichuanTokenizer", "unk_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": true } } ================================================ FILE: src/resources/models/baichuan-7B/config.json ================================================ { "architectures": [ "BaiChuanForCausalLM" ], "auto_map": { "AutoConfig": "configuration_baichuan.BaiChuanConfig", "AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM" }, "bos_token_id": 1, "eos_token_id": 2, "hidden_act": "silu", "hidden_size": 4096, "initializer_range": 0.02, "intermediate_size": 11008, "max_position_embeddings": 4096, "model_type": "baichuan", "num_attention_heads": 32, "num_hidden_layers": 32, "pad_token_id": 0, "rms_norm_eps": 1e-06, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.29.1", "use_cache": true, "vocab_size": 
64000 } ================================================ FILE: src/resources/models/baichuan-7B/configuration_baichuan.py ================================================ # coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # and OPT implementations in this library. It has been modified from its # original forms to accommodate minor architectural differences compared # to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) class BaiChuanConfig(PretrainedConfig): model_type = "baichuan" keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, vocab_size=64000, hidden_size=4096, intermediate_size=11008, num_hidden_layers=32, num_attention_heads=32, hidden_act="silu", max_position_embeddings=4096, initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, pad_token_id=0, bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs, ) ================================================ FILE: src/resources/models/baichuan-7B/modeling_baichuan.py ================================================ # coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # and OPT implementations in this library. It has been modified from its # original forms to accommodate minor architectural differences compared # to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .configuration_baichuan import BaiChuanConfig from transformers import PreTrainedModel, add_start_docstrings from transformers.activations import ACT2FN from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, \ SequenceClassifierOutputWithPast from transformers.utils import logging, add_start_docstrings_to_model_forward, replace_return_docstrings import math from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss logger = logging.get_logger(__name__) # Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask( input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 ): """ Make causal mask used for bi-directional self-attention. 
""" bsz, tgt_len = input_ids_shape mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) mask_cond = torch.arange(mask.size(-1), device=device) mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) mask = mask.to(dtype) if past_key_values_length > 0: mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) # Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) inverted_mask = 1.0 - expanded_mask return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) class RMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ RMSNorm is equivalent to T5LayerNorm """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps def forward(self, hidden_states): variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) # convert into half-precision if necessary if self.weight.dtype in [torch.float16, torch.bfloat16]: hidden_states = hidden_states.to(self.weight.dtype) return self.weight * hidden_states class RotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) self.register_buffer("inv_freq", inv_freq) # Build here to make `torch.jit.trace` work. 
self.max_seq_len_cached = max_position_embeddings t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. if seq_len > self.max_seq_len_cached: self.max_seq_len_cached = seq_len t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1).to(x.device) self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) return ( self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), ) def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2:] return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed class MLP(nn.Module): def __init__( self, hidden_size: int, intermediate_size: int, hidden_act: str, ): super().__init__() self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.act_fn = ACT2FN[hidden_act] def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) class Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config: BaiChuanConfig): super().__init__() self.config = config self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads self.max_position_embeddings = config.max_position_embeddings if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" f" and `num_heads`: {self.num_heads})." 
) # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) # self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) # self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() proj = self.W_pack(hidden_states) proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) # batch_size x source_len x hidden_size key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) # batch_size x target_len x head_size value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) # batch_size x source_len x hidden_size # query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) # key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) # value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] cos, 
sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) # [bsz, nh, t, hd] if past_key_value is not None: # reuse k, v, self_attention key_states = torch.cat([past_key_value[0], key_states], dim=2) value_states = torch.cat([past_key_value[1], value_states], dim=2) past_key_value = (key_states, value_states) if use_cache else None attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" f" {attn_weights.size()}" ) if attention_mask is not None: if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" f" {attn_output.size()}" ) attn_output = attn_output.transpose(1, 2) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None return attn_output, attn_weights, past_key_value class DecoderLayer(nn.Module): def __init__(self, config: BaiChuanConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = Attention(config=config) self.mlp = MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, ) 
self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`, *optional*): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states """ residual = hidden_states hidden_states = self.input_layernorm(hidden_states) # Self Attention hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, ) hidden_states = residual + hidden_states # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) if use_cache: outputs += (present_key_value,) return outputs class PreTrainedModel(PreTrainedModel): config_class = BaiChuanConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["DecoderLayer"] _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, Model): module.gradient_checkpointing = value class Model(PreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`DecoderLayer`] Args: config: BaiChuanConfig """ def __init__(self, config: BaiChuanConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.embed_tokens def set_input_embeddings(self, value): self.embed_tokens = value # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] combined_attention_mask = None if input_shape[-1] > 1: combined_attention_mask = _make_causal_mask( input_shape, inputs_embeds.dtype, device=inputs_embeds.device, past_key_values_length=past_key_values_length, ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( inputs_embeds.device ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) return combined_attention_mask def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, 
BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif input_ids is not None: batch_size, seq_length = input_ids.shape elif inputs_embeds is not None: batch_size, seq_length, _ = inputs_embeds.shape else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") seq_length_with_past = seq_length past_key_values_length = 0 if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] seq_length_with_past = seq_length_with_past + past_key_values_length if position_ids is None: device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device ) position_ids = position_ids.unsqueeze(0).view(-1, seq_length) else: position_ids = position_ids.view(-1, seq_length).long() if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) # embed positions if attention_mask is None: attention_mask = torch.ones( (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device ) attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length ) hidden_states = inputs_embeds if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( "`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`..." ) use_cache = False # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None next_decoder_cache = () if use_cache else None for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value return module(*inputs, output_attentions, None) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids, None, ) else: layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, ) hidden_states = layer_outputs[0] if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) if output_attentions: all_self_attns += (layer_outputs[1],) hidden_states = self.norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns, ) class BaiChuanForCausalLM(PreTrainedModel): def __init__(self, config): super().__init__(config) self.model = Model(config) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.model.embed_tokens def set_input_embeddings(self, value): 
self.model.embed_tokens = value def get_output_embeddings(self): return self.lm_head def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings def set_decoder(self, decoder): self.model = decoder def get_decoder(self): return self.model def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. Returns: Example: ```python >>> from transformers import AutoTokenizer, ModelForCausalLM >>> model = ModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) >>> prompt = "Hey, are you consciours? Can you talk to me?" >>> inputs = tokenizer(prompt, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(inputs.input_ids, max_length=30) >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = outputs[0] logits = self.lm_head(hidden_states) loss = None if labels is not None: # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() shift_logits = shift_logits.view(-1, self.config.vocab_size) shift_labels = shift_labels.view(-1) # Enable model parallelism shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output return CausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): if past_key_values: input_ids = input_ids[:, -1:] position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: position_ids = position_ids[:, -1].unsqueeze(-1) # if `inputs_embeds` are 
passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} else: model_inputs = {"input_ids": input_ids} model_inputs.update( { "position_ids": position_ids, "past_key_values": past_key_values, "use_cache": kwargs.get("use_cache"), "attention_mask": attention_mask, } ) return model_inputs @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) return reordered_past ================================================ FILE: src/resources/models/baichuan-7B/tokenization_baichuan.py ================================================ # coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # and OPT implementations in this library. It has been modified from its # original forms to accommodate minor architectural differences compared # to GPT-NeoX and OPT used by the Meta AI team that trained the model. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.utils import logging logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": {}, "tokenizer_file": {}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} class BaiChuanTokenizer(PreTrainedTokenizer): """ Construct a BaiChuan tokenizer. Based on byte-level Byte-Pair-Encoding. Args: vocab_file (`str`): Path to the vocabulary file. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] def __init__( self, vocab_file, unk_token="", bos_token="", eos_token="", pad_token=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, add_bos_token=True, add_eos_token=False, clean_up_tokenization_spaces=False, **kwargs, ): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, add_bos_token=add_bos_token, add_eos_token=add_eos_token, sp_model_kwargs=self.sp_model_kwargs, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) 
self.sp_model.Load(vocab_file) def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) @property def vocab_size(self): """Returns vocab size""" return self.sp_model.get_piece_size() def get_vocab(self): """Returns vocab as a dict""" vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text): """Returns a tokenized string.""" return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" token = self.sp_model.IdToPiece(index) return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" current_sub_tokens = [] out_string = "" prev_is_special = False for i, token in enumerate(tokens): # make sure that special tokens are not decoded using sentencepiece model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " out_string += self.sp_model.decode(current_sub_tokens) + token prev_is_special = True current_sub_tokens = [] else: current_sub_tokens.append(token) prev_is_special = False out_string += self.sp_model.decode(current_sub_tokens) return out_string def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. Args: save_directory (`str`): The directory in which to save the vocabulary. Returns: `Tuple(str)`: Paths to the files saved. 
""" if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) return (out_vocab_file,) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] output = bos_token_id + token_ids_0 + eos_token_id if token_ids_1 is not None: output = output + bos_token_id + token_ids_1 + eos_token_id return output def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) bos_token_id = [1] if self.add_bos_token else [] eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id return ( bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id ) def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | ``` if token_ids_1 is None, only returns the first portion of the mask (0s). Args: token_ids_0 (`List[int]`): List of ids. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
""" bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) if token_ids_1 is not None: output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) return output ================================================ FILE: src/resources/models/baichuan-7B/tokenizer_config.json ================================================ { "auto_map": { "AutoTokenizer": ["tokenization_baichuan.BaiChuanTokenizer", null] }, "add_bos_token": true, "add_eos_token": false, "bos_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false }, "clean_up_tokenization_spaces": false, "eos_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false }, "model_max_length": 1000000000000000019884624838656, "sp_model_kwargs": {}, "tokenizer_class": "BaiChuanTokenizer", "unk_token": { "__type": "AddedToken", "content": "", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false } } ================================================ FILE: src/resources/models/chatglm-6B/README.md ================================================ --- language: - zh - en tags: - glm - chatglm - chatgpt --- Link to github: [here](https://github.com/sunzeyeah/RLHF) --- 本仓库由[THUDM/chatglm-6b](https://huggingface.co/THUDM/chatglm-6b) fork而来,原仓库实现了PyTorch版本的ChatGLM模型,该模型有60亿参数量,模型权重文件以FP16格式存储。 本仓库在原始代码的基础上进行了部分调整,以支持ChatGPT训练pipeline,具体实现可参考:[sunzeyeah/RLHF](https://github.com/sunzeyeah/RLHF). This repository is forked from [THUDM/chatglm-6b](https://huggingface.co/THUDM/chatglm-6b) that contains PyTorch implementation of ChatGLM model with 6 billion parameters pretrained weights (FP16 precision). 
It is slightly different from the original ChatGLM implementation to support the ChatGPT training pipeline in this github repo: [sunzeyeah/RLHF](https://github.com/sunzeyeah/RLHF). --- # 介绍 ChatGLM-6B 是一个开源的、支持中英双语问答的对话语言模型,基于 [General Language Model (GLM)](https://github.com/THUDM/GLM) 架构,具有 62 亿参数。结合模型量化技术,用户可以在消费级的显卡上进行本地部署(INT4 量化级别下最低只需 6GB 显存)。ChatGLM-6B 使用了和 [ChatGLM](https://chatglm.cn) 相同的技术,针对中文问答和对话进行了优化。经过约 1T 标识符的中英双语训练,辅以监督微调、反馈自助、人类反馈强化学习等技术的加持,62 亿参数的 ChatGLM-6B 已经能生成相当符合人类偏好的回答。 ChatGLM-6B is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework, with 6.2 billion parameters. With the quantization technique, users can deploy locally on consumer-grade graphics cards (only 6GB of GPU memory is required at the INT4 quantization level). ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue. The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning, feedback bootstrap, and reinforcement learning wit human feedback. With only about 6.2 billion parameters, the model is able to generate answers that are in line with human preference. --- # 软件依赖 ```shell pip install protobuf==3.20.0 transformers==4.26.1 icetk cpm_kernels ``` --- # 代码调用 可以通过如下代码调用 ChatGLM-6B 模型来生成对话: ```ipython >>> from transformers import AutoTokenizer, AutoModel >>> tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/chatglm-6B", trust_remote_code=True) >>> model = AutoModel.from_pretrained("sunzeyeah/chatglm-6B", trust_remote_code=True).half().cuda() >>> response, history = model.chat(tokenizer, "你好", history=[]) >>> print(response) 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。 >>> response, history = model.chat(tokenizer, "晚上睡不着应该怎么办", history=history) >>> print(response) 晚上睡不着可能会让你感到焦虑或不舒服,但以下是一些可以帮助你入睡的方法: 1. 制定规律的睡眠时间表:保持规律的睡眠时间表可以帮助你建立健康的睡眠习惯,使你更容易入睡。尽量在每天的相同时间上床,并在同一时间起床。 2. 创造一个舒适的睡眠环境:确保睡眠环境舒适,安静,黑暗且温度适宜。可以使用舒适的床上用品,并保持房间通风。 3. 
放松身心:在睡前做些放松的活动,例如泡个热水澡,听些轻柔的音乐,阅读一些有趣的书籍等,有助于缓解紧张和焦虑,使你更容易入睡。 4. 避免饮用含有咖啡因的饮料:咖啡因是一种刺激性物质,会影响你的睡眠质量。尽量避免在睡前饮用含有咖啡因的饮料,例如咖啡,茶和可乐。 5. 避免在床上做与睡眠无关的事情:在床上做些与睡眠无关的事情,例如看电影,玩游戏或工作等,可能会干扰你的睡眠。 6. 尝试呼吸技巧:深呼吸是一种放松技巧,可以帮助你缓解紧张和焦虑,使你更容易入睡。试着慢慢吸气,保持几秒钟,然后缓慢呼气。 如果这些方法无法帮助你入睡,你可以考虑咨询医生或睡眠专家,寻求进一步的建议。 ``` ================================================ FILE: src/resources/models/chatglm-6B/config.json ================================================ { "_name_or_path": "sunzeyeah/chatglm-6B", "architectures": [ "ChatGLMModel" ], "auto_map": { "AutoConfig": "configuration_chatglm.ChatGLMConfig", "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration" }, "bos_token_id": 150004, "eos_token_id": 150005, "pad_token_id": 20003, "hidden_size": 4096, "inner_hidden_size": 16384, "layernorm_epsilon": 1e-05, "max_sequence_length": 2048, "model_type": "chatglm", "num_attention_heads": 32, "num_layers": 28, "position_encoding_2d": true, "torch_dtype": "float16", "transformers_version": "4.23.1", "use_cache": true, "vocab_size": 150528 } ================================================ FILE: src/resources/models/chatglm-6B/configuration_chatglm.py ================================================ """ ChatGLM model configuration """ from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) class ChatGLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`~ChatGLMModel`]. It is used to instantiate an ChatGLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 150528): Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`~ChatGLMModel`] or [`~TFChatGLMModel`]. hidden_size (`int`, *optional*, defaults to 4096): Dimension of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 28): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder. inner_hidden_size (`int`, *optional*, defaults to 16384): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. max_sequence_length (`int`, *optional*, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). layernorm_epsilon (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers. use_cache (`bool`, *optional*, defaults to `True`): Whether the model should return the last key/values attentions (not used by all models). 
Example: ```python >>> from configuration_chatglm import ChatGLMConfig >>> from modeling_chatglm import ChatGLMModel >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration >>> configuration = ChatGLMConfig() >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration >>> model = ChatGLMModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ``` """ model_type = "chatglm" def __init__( self, vocab_size=150528, hidden_size=4096, num_layers=28, num_attention_heads=32, layernorm_epsilon=1e-5, use_cache=False, bos_token_id=150004, eos_token_id=150005, pad_token_id=0, max_sequence_length=2048, inner_hidden_size=16384, position_encoding_2d=True, quantization_bit=0, pre_seq_len=None, prefix_projection=False, **kwargs ): self.num_layers = num_layers self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads self.max_sequence_length = max_sequence_length self.layernorm_epsilon = layernorm_epsilon self.inner_hidden_size = inner_hidden_size self.use_cache = use_cache self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id self.pad_token_id = pad_token_id self.position_encoding_2d = position_encoding_2d self.quantization_bit = quantization_bit self.pre_seq_len = pre_seq_len self.prefix_projection = prefix_projection super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs ) ================================================ FILE: src/resources/models/chatglm-6B/modeling_chatglm.py ================================================ """ PyTorch ChatGLM model. 
""" import math import copy import os import warnings import re import sys import torch import torch.utils.checkpoint import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss, LayerNorm # from torch.nn.utils import skip_init from typing import Optional, Tuple, Union, List, Callable from transformers.utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, ) from transformers.modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, BaseModelOutputWithPastAndCrossAttentions, ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import logging from transformers.generation.logits_process import LogitsProcessor from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig from .configuration_chatglm import ChatGLMConfig # flags required to enable jit fusion kernels if sys.platform != 'darwin': torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) torch._C._jit_override_can_fuse_on_cpu(True) torch._C._jit_override_can_fuse_on_gpu(True) logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B" _CONFIG_FOR_DOC = "ChatGLM6BConfig" CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ "THUDM/chatglm-6b", # See all ChatGLM-6B models at https://huggingface.co/models?filter=chatglm ] class InvalidScoreLogitsProcessor(LogitsProcessor): def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: if torch.isnan(scores).any() or torch.isinf(scores).any(): scores.zero_() scores[..., 20005] = 5e4 return scores def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. 
Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info(f"Converting TensorFlow checkpoint from {tf_path}") # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info(f"Loading TF weight {name} with shape {shape}") array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): logger.info(f"Skipping {'/'.join(name)}") continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info(f"Skipping {'/'.join(name)}") continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert ( pointer.shape == array.shape ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info(f"Initialize PyTorch weight {name}") pointer.data = torch.from_numpy(array) return model class 
PrefixEncoder(torch.nn.Module): """ The torch.nn model to encode the prefix Input shape: (batch-size, prefix-length) Output shape: (batch-size, prefix-length, 2*layers*hidden) """ def __init__(self, config): super().__init__() self.prefix_projection = config.prefix_projection if self.prefix_projection: # Use a two-layer MLP to encode the prefix self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size) self.trans = torch.nn.Sequential( torch.nn.Linear(config.hidden_size, config.hidden_size), torch.nn.Tanh(), torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2) ) else: self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2) def forward(self, prefix: torch.Tensor): if self.prefix_projection: prefix_tokens = self.embedding(prefix) past_key_values = self.trans(prefix_tokens) else: past_key_values = self.embedding(prefix) return past_key_values @torch.jit.script def gelu_impl(x): """OpenAI's gelu implementation.""" return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) def gelu(x): return gelu_impl(x) class RotaryEmbedding(torch.nn.Module): def __init__(self, dim, base=10000, precision=torch.half, learnable=False): super().__init__() inv_freq = 1. 
/ (base ** (torch.arange(0, dim, 2).float() / dim)) inv_freq = inv_freq.half() self.learnable = learnable if learnable: self.inv_freq = torch.nn.Parameter(inv_freq) self.max_seq_len_cached = None else: self.register_buffer('inv_freq', inv_freq) self.max_seq_len_cached = None self.cos_cached = None self.sin_cached = None self.precision = precision def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): pass def forward(self, x, seq_dim=1, seq_len=None): if seq_len is None: seq_len = x.shape[seq_dim] if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached): self.max_seq_len_cached = None if self.learnable else seq_len t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype) freqs = torch.einsum('i,j->ij', t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1).to(x.device) if self.precision == torch.bfloat16: emb = emb.float() # [sx, 1 (b * np), hn] cos_cached = emb.cos()[:, None, :] sin_cached = emb.sin()[:, None, :] if self.precision == torch.bfloat16: cos_cached = cos_cached.bfloat16() sin_cached = sin_cached.bfloat16() if self.learnable: return cos_cached, sin_cached self.cos_cached, self.sin_cached = cos_cached, sin_cached return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...] 
def _apply(self, fn): if self.cos_cached is not None: self.cos_cached = fn(self.cos_cached) if self.sin_cached is not None: self.sin_cached = fn(self.sin_cached) return super()._apply(fn) def rotate_half(x): x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:] return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions @torch.jit.script def apply_rotary_pos_emb_index(q, k, cos, sin, position_id): # position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn] cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), \ F.embedding(position_id, sin.squeeze(1)).unsqueeze(2) q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) return q, k def attention_fn( self, query_layer, key_layer, value_layer, attention_mask, hidden_size_per_partition, layer_id, layer_past=None, scaling_attention_score=True, use_cache=False, ): if layer_past is not None: past_key, past_value = layer_past[0], layer_past[1] key_layer = torch.cat((past_key, key_layer), dim=0) value_layer = torch.cat((past_value, value_layer), dim=0) # seqlen, batch, num_attention_heads, hidden_size_per_attention_head seq_len, b, nh, hidden_size = key_layer.shape if use_cache: present = (key_layer, value_layer) else: present = None query_key_layer_scaling_coeff = float(layer_id + 1) if scaling_attention_score: query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff) # =================================== # Raw attention scores. 
[b, np, s, s] # =================================== # [b, np, sq, sk] output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) matmul_result = torch.empty( output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, device=query_layer.device, ) matmul_result = torch.baddbmm( matmul_result, query_layer.transpose(0, 1), # [b * np, sq, hn] key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] beta=0.0, alpha=1.0, ) # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) if self.scale_mask_softmax: self.scale_mask_softmax.scale = query_key_layer_scaling_coeff attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous()) else: if not (attention_mask == 0).all(): # if auto-regressive, skip attention_scores.masked_fill_(attention_mask, -10000.0) dtype = attention_scores.dtype attention_scores = attention_scores.float() attention_scores = attention_scores * query_key_layer_scaling_coeff attention_probs = F.softmax(attention_scores, dim=-1) attention_probs = attention_probs.type(dtype) # ========================= # Context layer. [sq, b, hp] # ========================= # value_layer -> context layer. 
# [sk, b, np, hn] --> [b, np, sq, hn] # context layer shape: [b, np, sq, hn] output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) # change view [sk, b * np, hn] value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) # change view [b * np, sq, sk] attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) # matmul: [b * np, sq, hn] context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) # change view [b, np, sq, hn] context_layer = context_layer.view(*output_size) # [b, np, sq, hn] --> [sq, b, np, hn] context_layer = context_layer.permute(2, 0, 1, 3).contiguous() # [sq, b, np, hn] --> [sq, b, hp] new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, present, attention_probs) return outputs class SelfAttention(torch.nn.Module): def __init__(self, hidden_size, num_attention_heads, layer_id, hidden_size_per_attention_head=None, bias=True, params_dtype=torch.float, position_encoding_2d=True): super(SelfAttention, self).__init__() self.layer_id = layer_id self.hidden_size = hidden_size self.hidden_size_per_partition = hidden_size self.num_attention_heads = num_attention_heads self.num_attention_heads_per_partition = num_attention_heads self.position_encoding_2d = position_encoding_2d self.rotary_emb = RotaryEmbedding( self.hidden_size // (self.num_attention_heads * 2) if position_encoding_2d else self.hidden_size // self.num_attention_heads, base=10000, precision=torch.half, learnable=False, ) self.scale_mask_softmax = None if hidden_size_per_attention_head is None: self.hidden_size_per_attention_head = hidden_size // num_attention_heads else: self.hidden_size_per_attention_head = hidden_size_per_attention_head self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head # Strided linear layer. 
# self.query_key_value = skip_init( self.query_key_value = torch.nn.Linear( hidden_size, 3 * self.inner_hidden_size, bias=bias, dtype=params_dtype, ) # self.dense = skip_init( self.dense = torch.nn.Linear( self.inner_hidden_size, hidden_size, bias=bias, dtype=params_dtype, ) @staticmethod def attention_mask_func(attention_scores, attention_mask): attention_scores.masked_fill_(attention_mask, -10000.0) return attention_scores def split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_chunks=False): """Split a tensor along its last dimension. Arguments: tensor: input tensor. num_partitions: number of partitions to split the tensor contiguous_split_chunks: If True, make each chunk contiguous in memory. """ # Get the size and dimension. last_dim = tensor.dim() - 1 last_dim_size = tensor.size()[last_dim] // num_partitions # Split. tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) # Note: torch.split does not create contiguous tensors by default. if contiguous_split_chunks: return tuple(chunk.contiguous() for chunk in tensor_list) return tensor_list def forward( self, hidden_states: torch.Tensor, position_ids, attention_mask: torch.Tensor, layer_id, layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, use_cache: bool = False, output_attentions: bool = False, ): """ hidden_states: [seq_len, batch, hidden_size] attention_mask: [(1, 1), seq_len, seq_len] """ # [seq_len, batch, 3 * hidden_size] mixed_raw_layer = self.query_key_value(hidden_states) # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 * hidden_size_per_attention_head] new_tensor_shape = mixed_raw_layer.size()[:-1] + ( self.num_attention_heads_per_partition, 3 * self.hidden_size_per_attention_head, ) mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape) # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head] (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim(mixed_raw_layer, 3) if 
self.position_encoding_2d: q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1)) k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1)) cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1) position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \ position_ids[:, 1, :].transpose(0, 1).contiguous() q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids) q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids) query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1)) key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1)) else: position_ids = position_ids.transpose(0, 1) cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1) # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head] query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, position_ids) # [seq_len, batch, hidden_size] context_layer, present, attention_probs = attention_fn( self=self, query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask, hidden_size_per_partition=self.hidden_size_per_partition, layer_id=layer_id, layer_past=layer_past, use_cache=use_cache ) output = self.dense(context_layer) outputs = (output, present) if output_attentions: outputs += (attention_probs,) return outputs # output, present, attention_probs class GEGLU(torch.nn.Module): def __init__(self): super().__init__() self.activation_fn = F.gelu def forward(self, x): # dim=-1 breaks in jit for pt<1.10 x1, x2 = x.chunk(2, dim=(x.ndim - 1)) return x1 * self.activation_fn(x2) class GLU(torch.nn.Module): def __init__(self, hidden_size, inner_hidden_size=None, layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float): super(GLU, self).__init__() self.layer_id = layer_id self.activation_func = activation_func # Project to 4h. 
class GLU(torch.nn.Module):
    """Position-wise feed-forward block: h -> inner (default 4h) -> activation -> h."""

    def __init__(self, hidden_size, inner_hidden_size=None, layer_id=None, bias=True, activation_func=gelu,
                 params_dtype=torch.float):
        super(GLU, self).__init__()
        self.layer_id = layer_id
        self.activation_func = activation_func

        self.hidden_size = hidden_size
        if inner_hidden_size is None:
            inner_hidden_size = 4 * hidden_size
        self.inner_hidden_size = inner_hidden_size
        # Up-projection to the inner dimension (upstream used skip_init; a
        # plain Linear is kept here).
        self.dense_h_to_4h = torch.nn.Linear(
            self.hidden_size,
            self.inner_hidden_size,
            bias=bias,
            dtype=params_dtype,
        )
        # Down-projection back to the model dimension.
        self.dense_4h_to_h = torch.nn.Linear(
            self.inner_hidden_size,
            self.hidden_size,
            bias=bias,
            dtype=params_dtype,
        )

    def forward(self, hidden_states):
        """hidden_states: [seq_len, batch, hidden_size] -> same shape."""
        projected = self.dense_h_to_4h(hidden_states)   # [seq_len, batch, inner_hidden_size]
        activated = self.activation_func(projected)
        return self.dense_4h_to_h(activated)


class GLMBlock(torch.nn.Module):
    """One GLM transformer layer.

    Pre-LN self-attention followed by a feed-forward (GLU) sub-block; each
    sub-block output is combined with its *normalized* input through a
    residual scaled by alpha = sqrt(2 * num_layers).
    """

    def __init__(
            self,
            hidden_size,
            num_attention_heads,
            layernorm_epsilon,
            layer_id,
            inner_hidden_size=None,
            hidden_size_per_attention_head=None,
            layernorm=LayerNorm,
            use_bias=True,
            params_dtype=torch.float,
            num_layers=28,
            position_encoding_2d=True
    ):
        super(GLMBlock, self).__init__()
        self.layer_id = layer_id

        # LayerNorm on the block input (pre-LN).
        self.input_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)

        self.position_encoding_2d = position_encoding_2d

        self.attention = SelfAttention(
            hidden_size,
            num_attention_heads,
            layer_id,
            hidden_size_per_attention_head=hidden_size_per_attention_head,
            bias=use_bias,
            params_dtype=params_dtype,
            position_encoding_2d=self.position_encoding_2d
        )

        # LayerNorm between attention and the MLP.
        self.post_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)

        self.num_layers = num_layers

        self.mlp = GLU(
            hidden_size,
            inner_hidden_size=inner_hidden_size,
            bias=use_bias,
            layer_id=layer_id,
            params_dtype=params_dtype,
        )

    def forward(
            self,
            hidden_states: torch.Tensor,
            position_ids,
            attention_mask: torch.Tensor,
            layer_id,
            layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
            use_cache: bool = False,
            output_attentions: bool = False,
    ):
        """
        hidden_states: [seq_len, batch, hidden_size]
        attention_mask: [(1, 1), seq_len, seq_len]
        Returns (hidden_states, present[, attentions]).
        """
        normed_input = self.input_layernorm(hidden_states)

        attn_results = self.attention(
            normed_input,
            position_ids,
            attention_mask=attention_mask,
            layer_id=layer_id,
            layer_past=layer_past,
            use_cache=use_cache,
            output_attentions=output_attentions
        )
        attn_output = attn_results[0]
        extras = attn_results[1:]

        # Residual in the style of DeepNorm: the normalized input is scaled.
        alpha = (2 * self.num_layers) ** 0.5
        hidden_states = normed_input * alpha + attn_output

        mlp_input = self.post_attention_layernorm(hidden_states)
        mlp_output = self.mlp(mlp_input)

        # Second scaled residual.
        output = mlp_input * alpha + mlp_output

        if use_cache:
            return (output,) + extras
        return (output,) + extras[1:]
class ChatGLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    is_parallelizable = False
    supports_gradient_checkpointing = True
    config_class = ChatGLMConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["GLMBlock"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module: nn.Module):
        """Initialize the weights — intentionally a no-op: weights are
        expected to come from a pretrained checkpoint."""
        return

    def _set_gradient_checkpointing(self, module, value=False):
        # Only the backbone model supports toggling gradient checkpointing.
        if isinstance(module, ChatGLMModel):
            module.gradient_checkpointing = value


CHATGLM_6B_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`~ChatGLM6BConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CHATGLM_6B_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`ChatGLM6BTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
def __init__(self, config: ChatGLMConfig):
    """Build the GLM backbone: token embedding, `num_layers` GLMBlocks and a
    final LayerNorm.  When `config.pre_seq_len` is set, every backbone weight
    is frozen and a trainable PrefixEncoder (p-tuning v2) is added."""
    super().__init__(config)

    # Cache frequently used config values on the instance.
    self.max_sequence_length = config.max_sequence_length
    self.hidden_size = config.hidden_size
    self.params_dtype = torch.half
    self.num_attention_heads = config.num_attention_heads
    self.vocab_size = config.vocab_size
    self.num_layers = config.num_layers
    self.layernorm_epsilon = config.layernorm_epsilon
    self.inner_hidden_size = config.inner_hidden_size
    self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
    self.position_encoding_2d = config.position_encoding_2d
    self.pre_seq_len = config.pre_seq_len
    self.prefix_projection = config.prefix_projection

    # Plain Embedding (upstream used torch.nn.utils.skip_init).
    self.word_embeddings = torch.nn.Embedding(
        num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
        dtype=self.params_dtype
    )
    self.gradient_checkpointing = False

    def get_layer(layer_id):
        # One transformer layer; layer_id is threaded through for per-layer scaling.
        return GLMBlock(
            self.hidden_size,
            self.num_attention_heads,
            self.layernorm_epsilon,
            layer_id,
            inner_hidden_size=self.inner_hidden_size,
            hidden_size_per_attention_head=self.hidden_size_per_attention_head,
            layernorm=LayerNorm,
            use_bias=True,
            params_dtype=self.params_dtype,
            position_encoding_2d=self.position_encoding_2d,
        )

    self.layers = torch.nn.ModuleList(
        [get_layer(layer_id) for layer_id in range(self.num_layers)]
    )

    # Final layer norm before output.
    self.final_layernorm = LayerNorm(self.hidden_size, eps=self.layernorm_epsilon)

    if self.pre_seq_len is not None:
        # p-tuning v2: freeze the backbone, train only the prefix encoder.
        for param in self.parameters():
            param.requires_grad = False
        self.prefix_tokens = torch.arange(self.pre_seq_len).long()
        self.prefix_encoder = PrefixEncoder(config)
        self.dropout = torch.nn.Dropout(0.1)

def get_input_embeddings(self):
    return self.word_embeddings

def set_input_embeddings(self, new_embeddings: torch.Tensor):
    self.word_embeddings = new_embeddings

def get_prompt(self, batch_size, device, dtype=torch.half):
    """Encode the learned prefix into per-layer key/value pairs for p-tuning v2."""
    prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
    past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
    past_key_values = past_key_values.view(
        batch_size,
        self.pre_seq_len,
        self.num_layers * 2,
        self.num_attention_heads,
        self.hidden_size // self.num_attention_heads
    )
    # Re-layout to [num_layers*2, pre_seq_len, batch, heads, head_dim] and
    # split into per-layer (key, value) pairs of size 2 along dim 0.
    past_key_values = self.dropout(past_key_values)
    past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
    return past_key_values

def get_masks(self, input_ids, device):
    """GLM attention mask: the context (everything before BOS) is fully
    visible; generated positions are causal.  Returned bool mask uses
    True == masked, shape [batch, 1, seq, seq]."""
    batch_size, seq_length = input_ids.shape
    context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
    attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
    attention_mask.tril_()
    for i, context_length in enumerate(context_lengths):
        attention_mask[i, :, :context_length] = 1
    attention_mask.unsqueeze_(1)
    attention_mask = (attention_mask < 0.5).bool()

    return attention_mask

def get_position_ids(self, input_ids, mask_positions, device, gmask=False):
    """Per-token position ids; in 2D mode returns [batch, 2, seq] with block
    position ids stacked alongside the absolute ones."""
    batch_size, seq_length = input_ids.shape
    context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
    if self.position_encoding_2d:
        # `repeat` (not `expand`) so the per-row in-place writes below are
        # legal — writing into an expanded (stride-0) view is an error.
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
        if not gmask:
            for i, context_length in enumerate(context_lengths):
                position_ids[i, context_length:] = mask_positions[i]
        block_position_ids = [torch.cat((
            torch.zeros(context_length, dtype=torch.long, device=device),
            torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
        )) for context_length in context_lengths]
        block_position_ids = torch.stack(block_position_ids, dim=0)
        position_ids = torch.stack((position_ids, block_position_ids), dim=1)
    else:
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
        if not gmask:
            for i, context_length in enumerate(context_lengths):
                # BUG FIX: original wrote `position_ids[context_length:] = ...`,
                # indexing the *batch* dimension and clobbering other rows.
                position_ids[i, context_length:] = mask_positions[i]

    return position_ids

@add_start_docstrings_to_model_forward(CHATGLM_6B_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=BaseModelOutputWithPastAndCrossAttentions,
    config_class=_CONFIG_FOR_DOC,
)
def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPast]:
    """Run the GLM backbone; returns hidden states in [seq, batch, hidden]."""
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if self.gradient_checkpointing and self.training:
        # `use_cache=True` is incompatible with gradient checkpointing.
        if use_cache:
            use_cache = False

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        batch_size, seq_length = input_ids.shape[:2]
    elif inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape[:2]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    if inputs_embeds is None:
        inputs_embeds = self.word_embeddings(input_ids)

    if past_key_values is None:
        if self.pre_seq_len is not None:
            past_key_values = self.get_prompt(batch_size=input_ids.shape[0], device=input_ids.device,
                                              dtype=inputs_embeds.dtype)
        else:
            past_key_values = tuple([None] * len(self.layers))

        if attention_mask is None:
            attention_mask = self.get_masks(
                input_ids,
                device=input_ids.device
            )
            # NOTE(review): the prefix mask extension applies when the mask was
            # auto-built here — confirm against the p-tuning training scripts.
            if self.pre_seq_len is not None:
                prefix_attention_mask = torch.ones(batch_size, 1, input_ids.size(-1), self.pre_seq_len).to(
                    attention_mask.device)
                prefix_attention_mask = (prefix_attention_mask < 0.5).bool()
                attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=3)

        if position_ids is None:
            MASK, gMASK = 150000, 150001
            mask_token = MASK if MASK in input_ids else gMASK
            # BUG FIX: original assigned the int token id `gMASK` here; only
            # its truthiness was used, so make the flag an explicit bool.
            use_gmask = mask_token == gMASK

            mask_positions = [seq.tolist().index(mask_token) for seq in input_ids]
            position_ids = self.get_position_ids(
                input_ids,
                mask_positions=mask_positions,
                device=input_ids.device,
                gmask=use_gmask
            )

    # [seq_len, batch, hidden_size]
    hidden_states = inputs_embeds.transpose(0, 1)

    presents = () if use_cache else None
    all_self_attentions = () if output_attentions else None
    all_hidden_states = () if output_hidden_states else None

    if attention_mask is None:
        attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
    else:
        attention_mask = attention_mask.to(input_ids.device)

    for i, layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        layer_past = past_key_values[i]

        if self.gradient_checkpointing and self.training:
            layer_ret = torch.utils.checkpoint.checkpoint(
                layer,
                hidden_states,
                position_ids,
                attention_mask,
                torch.tensor(i),
                layer_past,
                use_cache,
                output_attentions
            )
        else:
            layer_ret = layer(
                hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                layer_id=torch.tensor(i),
                layer_past=layer_past,
                use_cache=use_cache,
                output_attentions=output_attentions
            )

        hidden_states = layer_ret[0]

        if use_cache:
            presents = presents + (layer_ret[1],)
        if output_attentions:
            # With use_cache the attention probs sit behind the present tuple.
            all_self_attentions = all_self_attentions + (layer_ret[2 if use_cache else 1],)

    # Final layer norm.
    hidden_states = self.final_layernorm(hidden_states)

    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )


class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
    def __init__(self, config: ChatGLMConfig):
        """GLM backbone plus a (half-precision) LM head; optionally quantized
        right away when `config.quantization_bit` is set."""
        super().__init__(config)

        self.max_sequence_length = config.max_sequence_length
        self.position_encoding_2d = config.position_encoding_2d

        self.transformer = ChatGLMModel(config)
        # Plain Linear (upstream used skip_init).
        self.lm_head = nn.Linear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            dtype=torch.half
        )

        self.config = config
        self.quantized = False

        if self.config.quantization_bit:
            self.quantize(self.config.quantization_bit, empty_init=True)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
def get_masks_and_position_ids(self, input_ids, mask_positions, device, gmask=False):
    """Build the GLM attention mask and position ids for a full batch.

    Mask: bool [batch, 1, seq, seq], True == masked; the context before the
    BOS token attends bidirectionally, the rest is causal.
    Position ids: [batch, seq] or, with 2D encoding, [batch, 2, seq].
    """
    batch_size, seq_length = input_ids.shape
    context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]

    attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
    attention_mask.tril_()
    for i, context_length in enumerate(context_lengths):
        # Context tokens are visible to every position.
        attention_mask[i, :, :context_length] = 1
    attention_mask.unsqueeze_(1)
    attention_mask = (attention_mask < 0.5).bool()

    if self.position_encoding_2d:
        # `repeat` (not `expand`) so the per-row in-place writes below are
        # legal — writing into an expanded (stride-0) view is an error.
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
        if not gmask:
            for i, context_length in enumerate(context_lengths):
                position_ids[i, context_length:] = mask_positions[i]
        block_position_ids = [torch.cat((
            torch.zeros(context_length, dtype=torch.long, device=device),
            torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
        )) for context_length in context_lengths]
        block_position_ids = torch.stack(block_position_ids, dim=0)
        position_ids = torch.stack((position_ids, block_position_ids), dim=1)
    else:
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
        if not gmask:
            for i, context_length in enumerate(context_lengths):
                # BUG FIX: original wrote `position_ids[context_length:] = ...`,
                # indexing the *batch* dimension and clobbering other rows.
                position_ids[i, context_length:] = mask_positions[i]

    return attention_mask, position_ids

def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past: Optional[torch.Tensor] = None,
        past_key_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs
) -> dict:
    """Assemble model inputs for `generate`; with a cache present, only the
    last token plus its (mask, position) bookkeeping is fed to the model."""
    batch_size, seq_length = input_ids.shape
    MASK, gMASK = 150000, 150001
    mask_token = MASK if MASK in input_ids else gMASK
    # BUG FIX: original assigned the int token id `gMASK` here; only its
    # truthiness was used, so make the flag an explicit bool.
    use_gmask = mask_token == gMASK
    seqs = input_ids.tolist()
    mask_positions = [seq.index(mask_token) for seq in seqs]

    # only last token for input_ids if past is not None
    if past is not None or past_key_values is not None:
        context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs]
        last_token = input_ids[:, -1].unsqueeze(-1)
        if self.position_encoding_2d:
            position_ids = torch.tensor(
                [[mask_position, seq_length - context_length] for mask_position, context_length in
                 zip(mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1)
        else:
            position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long,
                                        device=input_ids.device).unsqueeze(-1)

        if past is None:
            past = past_key_values
        return {
            "input_ids": last_token,
            "past_key_values": past,
            "position_ids": position_ids,
        }
    else:
        attention_mask, position_ids = self.get_masks_and_position_ids(
            input_ids,
            mask_positions=mask_positions,
            device=input_ids.device,
            gmask=use_gmask
        )

        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "position_ids": position_ids,
            "attention_mask": attention_mask
        }

def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
):
    """Backbone forward plus LM head; when `labels` is given, computes a
    shifted cross-entropy loss ignoring `config.pad_token_id`."""
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    transformer_outputs = self.transformer(
        input_ids=input_ids,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    hidden_states = transformer_outputs[0]
    # Backbone emits [seq, batch, hidden]; put batch first for the logits.
    lm_logits = self.lm_head(hidden_states).permute(1, 0, 2).contiguous()

    loss = None
    if labels is not None:
        lm_logits = lm_logits.to(torch.float32)

        # Shift so that tokens < n predict n.
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss_fct = CrossEntropyLoss(ignore_index=self.config.pad_token_id)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        lm_logits = lm_logits.to(hidden_states.dtype)
        loss = loss.to(hidden_states.dtype)

    if not return_dict:
        output = (lm_logits,) + transformer_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=lm_logits,
        past_key_values=transformer_outputs.past_key_values,
        hidden_states=transformer_outputs.hidden_states,
        attentions=transformer_outputs.attentions,
    )
""" return tuple( ( layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), ) for layer_past in past ) def process_response(self, response): response = response.strip() response = response.replace("[[训练时间]]", "2023年") punkts = [ [",", ","], ["!", "!"], [":", ":"], [";", ";"], ["\?", "?"], ] for item in punkts: response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) return response @torch.no_grad() def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1, do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs): if history is None: history = [] if logits_processor is None: logits_processor = LogitsProcessorList() logits_processor.append(InvalidScoreLogitsProcessor()) gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "temperature": temperature, "logits_processor": logits_processor, **kwargs} if not history: prompt = query else: prompt = "" for i, (old_query, response) in enumerate(history): prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response) prompt += "[Round {}]\n问:{}\n答:".format(len(history), query) input_ids = tokenizer([prompt], return_tensors="pt", padding=True) input_ids = input_ids.to(self.device) outputs = self.generate(**input_ids, **gen_kwargs) outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]):] response = tokenizer.decode(outputs) response = self.process_response(response) history = history + [(query, response)] return response, history @torch.no_grad() def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs): if history is None: history = [] if logits_processor is None: logits_processor = 
@torch.no_grad()
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,
                do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
    """Streaming variant of `chat`: yields (partial_response, updated_history)
    after every generated token."""
    if history is None:
        history = []
    if logits_processor is None:
        logits_processor = LogitsProcessorList()
    logits_processor.append(InvalidScoreLogitsProcessor())
    gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
                  "temperature": temperature, "logits_processor": logits_processor, **kwargs}
    if not history:
        prompt = query
    else:
        prompt = ""
        for i, (old_query, response) in enumerate(history):
            prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
        prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
    input_ids = tokenizer([prompt], return_tensors="pt", padding=True)
    input_ids = input_ids.to(self.device)
    for outputs in self.stream_generate(**input_ids, **gen_kwargs):
        outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]):]
        response = tokenizer.decode(outputs)
        response = self.process_response(response)
        new_history = history + [(query, response)]
        yield response, new_history

@torch.no_grad()
def stream_generate(
        self,
        input_ids,
        generation_config: Optional[GenerationConfig] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
        **kwargs,
):
    """Token-by-token sampling loop that yields the growing `input_ids` after
    every step, mirroring `GenerationMixin.sample`."""
    batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]

    if generation_config is None:
        generation_config = self.generation_config
    generation_config = copy.deepcopy(generation_config)
    model_kwargs = generation_config.update(**kwargs)
    bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id

    if isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]

    has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
    if has_default_max_length and generation_config.max_new_tokens is None:
        warnings.warn(
            f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
            "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
            " recommend using `max_new_tokens` to control the maximum length of the generation.",
            UserWarning,
        )
    elif generation_config.max_new_tokens is not None:
        generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
        if not has_default_max_length:
            # BUG FIX: was `logger.warn(msg, UserWarning)` — `Logger.warn` is
            # deprecated and the stray UserWarning argument was treated as a
            # %-format argument, breaking the log record's message formatting.
            logger.warning(
                f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                "Please refer to the documentation for more information. "
                "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
            )

    if input_ids_seq_length >= generation_config.max_length:
        input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
        logger.warning(
            f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
            f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
            " increasing `max_new_tokens`."
        )

    # 2. Set generation parameters if not already defined
    logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
    stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

    logits_processor = self._get_logits_processor(
        generation_config=generation_config,
        input_ids_seq_length=input_ids_seq_length,
        encoder_input_ids=input_ids,
        prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
        logits_processor=logits_processor,
    )

    stopping_criteria = self._get_stopping_criteria(
        generation_config=generation_config, stopping_criteria=stopping_criteria
    )
    logits_warper = self._get_logits_warper(generation_config)

    unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
    scores = None
    while True:
        model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
        # forward pass to get next token
        outputs = self(
            **model_inputs,
            return_dict=True,
            output_attentions=False,
            output_hidden_states=False,
        )

        next_token_logits = outputs.logits[:, -1, :]

        # pre-process distribution
        next_token_scores = logits_processor(input_ids, next_token_logits)
        next_token_scores = logits_warper(input_ids, next_token_scores)

        # sample
        probs = nn.functional.softmax(next_token_scores, dim=-1)
        if generation_config.do_sample:
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_tokens = torch.argmax(probs, dim=-1)

        # update generated ids, model inputs, and length for next step
        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
        model_kwargs = self._update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
        )
        # BUG FIX: a sequence is finished when its token equals ANY eos id.
        # The original summed `next_tokens != i` over eos ids, which keeps a
        # sequence "unfinished" whenever more than one eos id is configured.
        not_eos = torch.ones_like(next_tokens, dtype=torch.bool)
        for eos_id in eos_token_id:
            not_eos = not_eos & (next_tokens != eos_id)
        unfinished_sequences = unfinished_sequences.mul(not_eos.long())

        # stop when each sentence is finished, or if we exceed the maximum length
        if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
            break
        yield input_ids
def quantize(self, bits: int, empty_init=False, **kwargs):
    """Quantize the transformer weights in place to `bits` (e.g. 4 or 8).

    A no-op when `bits == 0`; returns `self` otherwise (also when the model
    was already quantized, in which case nothing is re-done).
    """
    if bits == 0:
        return

    # Imported lazily so unquantized use never touches the CUDA kernels.
    from .quantization import quantize as _quantize_module

    if self.quantized:
        logger.info("Already quantized.")
        return self

    self.quantized = True
    self.config.quantization_bit = bits

    self.transformer = _quantize_module(self.transformer, bits, empty_init=empty_init, **kwargs)
    return self
"$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2
NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5X
K9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI
class W8A16Linear(torch.autograd.Function):
    """Matrix multiply against a weight stored in int8/int4.

    The quantized weight is dequantized to fp16 on the fly in both the
    forward and backward passes, so only the compact representation is
    kept resident in memory.
    """

    @staticmethod
    def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
        # Remember the original activation shape so gradients can be
        # reshaped back in backward().
        ctx.inp_shape = inp.size()
        ctx.weight_bit_width = weight_bit_width
        out_features = quant_w.size(0)
        flat_inp = inp.contiguous().view(-1, inp.size(-1))
        # Dequantize to half precision for the actual matmul.
        weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
        ctx.weight_shape = weight.size()
        result = flat_inp.mm(weight.t())
        # Save the compact weight (not the fp16 copy) to keep memory low.
        ctx.save_for_backward(flat_inp, quant_w, scale_w)
        return result.view(*(ctx.inp_shape[:-1] + (out_features,)))

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        flat_inp, quant_w, scale_w = ctx.saved_tensors
        # Dequantize again rather than holding the fp16 weight across the pass.
        weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
        flat_grad = grad_output.contiguous().view(-1, weight.size(0))
        grad_input = flat_grad.mm(weight)
        grad_weight = flat_grad.t().mm(flat_inp)
        # No gradients for scale or bit width.
        return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None


def compress_int4_weight(weight: torch.Tensor):  # (n, m)
    """Pack an int8 tensor of int4 values into half as many bytes (CUDA kernel)."""
    with torch.cuda.device(weight.device):
        n, m = weight.size(0), weight.size(1)
        assert m % 2 == 0  # two int4 values share one output byte
        m = m // 2
        packed = torch.empty(n, m, dtype=torch.int8, device="cuda")
        stream = torch.cuda.current_stream()
        grid_dim = (n, 1, 1)
        block_dim = (min(round_up(m, 32), 1024), 1, 1)
        kernels.int4WeightCompression(
            grid_dim,
            block_dim,
            0,
            stream,
            [
                ctypes.c_void_p(weight.data_ptr()),
                ctypes.c_void_p(packed.data_ptr()),
                ctypes.c_int32(n),
                ctypes.c_int32(m),
            ],
        )
        return packed


def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
    """Dequantize an int8/int4 weight tensor to fp16 using the CUDA kernels."""
    if source_bit_width == 8:
        kernel = kernels.int8WeightExtractionHalf
    elif source_bit_width == 4:
        kernel = kernels.int4WeightExtractionHalf
    else:
        assert False, "Unsupported bit-width"

    with torch.cuda.device(weight.device):
        n, m = weight.size(0), weight.size(1)
        # int4 weights unpack to twice as many columns as stored bytes.
        out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda")
        stream = torch.cuda.current_stream()
        grid_dim = (n, 1, 1)
        block_dim = (min(round_up(m, 32), 1024), 1, 1)
        kernel(
            grid_dim,
            block_dim,
            0,
            stream,
            [
                ctypes.c_void_p(weight.data_ptr()),
                ctypes.c_void_p(scale_list.data_ptr()),
                ctypes.c_void_p(out.data_ptr()),
                ctypes.c_int32(n),
                ctypes.c_int32(m),
            ],
        )
        return out


class QuantizedLinear(Linear):
    """A Linear layer whose weight is stored quantized (int8 or int4).

    When `weight_tensor` is given (and `empty_init` is false) the fp16
    weight is quantized immediately with symmetric per-row scaling;
    otherwise uninitialized quantized storage is allocated so weights can
    be loaded from a checkpoint afterwards.
    """

    def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, empty_init=False,
                 *args, **kwargs):
        super(QuantizedLinear, self).__init__(*args, **kwargs)
        self.weight_bit_width = weight_bit_width

        shape = self.weight.shape
        del self.weight  # replaced below by the quantized representation

        if weight_tensor is None or empty_init:
            # Allocate storage only; real values arrive via state_dict loading.
            self.weight = torch.empty(
                shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"]
            )
            self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
        else:
            # Symmetric per-row quantization: each row is scaled by its max
            # absolute value divided by the largest representable magnitude.
            self.weight_scale = (
                weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
            ).half()
            self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
            if weight_bit_width == 4:
                self.weight = compress_int4_weight(self.weight)

        self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
        self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
        if bias_tensor is not None:
            self.bias = Parameter(bias_tensor.to(kwargs["device"]), requires_grad=False)
        else:
            self.bias = None

    def forward(self, input):
        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
        if self.bias is not None:
            output = output + self.bias
        return output


def quantize(model, weight_bit_width, empty_init=False, **kwargs):
    """Replace fp16 linear with quantized linear"""
    for layer in model.layers:
        # The four projection matrices per transformer layer, in the same
        # order the original code replaced them.
        for owner, attr_name in (
            (layer.attention, "query_key_value"),
            (layer.attention, "dense"),
            (layer.mlp, "dense_h_to_4h"),
            (layer.mlp, "dense_4h_to_h"),
        ):
            linear = getattr(owner, attr_name)
            setattr(
                owner,
                attr_name,
                QuantizedLinear(
                    weight_bit_width=weight_bit_width,
                    weight_tensor=linear.weight.to(torch.cuda.current_device()),
                    bias_tensor=linear.bias,
                    in_features=linear.in_features,
                    out_features=linear.out_features,
                    bias=True,
                    dtype=torch.half,
                    device=linear.weight.device,
                    empty_init=empty_init,
                ),
            )
    return model
self._build_text_tokenizer(encode_special_tokens=False) self.special_text_tokenizer = self._build_text_tokenizer(encode_special_tokens=True) @staticmethod def _configure_tokenizer( text_tokenizer: TextTokenizer, special_tokens: List[str], max_blank_length: int, byte_fallback: bool, encode_special_tokens=False, ): # special token special_token_type = 4 if encode_special_tokens else 3 # 3 - CONTROL, 4 - USER_DEFINE for token in special_tokens: text_tokenizer.proto.pieces.append( sp_model.ModelProto.SentencePiece(piece=token, score=0.0, type=special_token_type) ) # whitespaces for token in [SPTokenizer.get_tab_token()] + [ SPTokenizer.get_blank_token(i) for i in range(2, max_blank_length + 1) ]: text_tokenizer.proto.pieces.append(sp_model.ModelProto.SentencePiece(piece=token, score=0.0, type=4)) # byte fallback if byte_fallback: text_tokenizer.proto.trainer_spec.byte_fallback = True for i in range(256): text_tokenizer.proto.pieces.append( sp_model.ModelProto.SentencePiece(piece="<0x{:02X}>".format(i), score=0.0, type=6) ) text_tokenizer.refresh() def _build_text_tokenizer(self, encode_special_tokens=False): tokenizer = TextTokenizer(self.vocab_file) self._configure_tokenizer( tokenizer, self.special_tokens, self.max_blank_length, self.byte_fallback, encode_special_tokens ) return tokenizer def _get_text_tokenizer(self, encode_special_tokens=False): if encode_special_tokens: return self.special_text_tokenizer else: return self.text_tokenizer @staticmethod def get_blank_token(length: int): assert length >= 2 return f"<|blank_{length}|>" @staticmethod def get_tab_token(): return f"<|tab|>" @property def num_image_tokens(self): return 20000 @property def num_text_tokens(self): return self.text_tokenizer.num_tokens @property def num_tokens(self): return self.num_image_tokens + self.num_text_tokens @staticmethod def _encode_whitespaces(text: str, max_len: int = 80): text = text.replace("\t", SPTokenizer.get_tab_token()) for i in range(max_len, 1, -1): text = text.replace(" 
" * i, SPTokenizer.get_blank_token(i)) return text def _preprocess(self, text: str, linebreak=True, whitespaces=True): if linebreak: text = text.replace("\n", "") if whitespaces: text = self._encode_whitespaces(text, max_len=self.max_blank_length) return text def encode( self, text: str, linebreak=True, whitespaces=True, special_tokens=False, add_dummy_prefix=True ) -> List[int]: """ @param text: Text to encode. @param linebreak: Whether to encode newline (\n) in text. @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. @param add_dummy_prefix: Whether to add dummy blank space in the beginning. """ text = self._preprocess(text, linebreak, whitespaces) if not add_dummy_prefix: text = "" + text tmp = self._get_text_tokenizer(encode_special_tokens=special_tokens).encode(text) tokens = [x + self.num_image_tokens for x in tmp] return tokens if add_dummy_prefix else tokens[2:] def decode(self, text_ids: List[int], special_tokens=False) -> str: ids = [int(_id) - self.num_image_tokens for _id in text_ids] text_tokenizer = self._get_text_tokenizer(encode_special_tokens=special_tokens) if special_tokens: text = "".join([text_tokenizer.convert_id_to_token(_id) for _id in ids if _id >= 0]) else: ids = [_id for _id in ids if _id >= 0] text = text_tokenizer.decode(ids) text = text.replace("", "\n") text = text.replace(SPTokenizer.get_tab_token(), "\t") for i in range(2, self.max_blank_length + 1): text = text.replace(self.get_blank_token(i), " " * i) return text def tokenize( self, text: str, linebreak=True, whitespaces=True, special_tokens=False, add_dummy_prefix=True ) -> List[str]: """ @param text: Text to encode. @param linebreak: Whether to encode newline (\n) in text. @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. 
@param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. @param add_dummy_prefix: Whether to add dummy blank space in the beginning. """ text = self._preprocess(text, linebreak, whitespaces) if not add_dummy_prefix: text = "" + text tokens = self._get_text_tokenizer(encode_special_tokens=special_tokens).tokenize(text) return tokens if add_dummy_prefix else tokens[2:] def __getitem__(self, x: Union[int, str]): if isinstance(x, int): if x < self.num_image_tokens: return "".format(x) else: return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens) elif isinstance(x, str): if x.startswith("") and x[7:-1].isdigit(): return int(x[7:-1]) else: return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens else: raise ValueError("The key should be str or int.") class ChatGLMTokenizer(PreTrainedTokenizer): """ Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding. Args: vocab_file (`str`): Path to the vocabulary file. """ vocab_files_names = {"vocab_file": "ice_text.model"} max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids"] def __init__( self, vocab_file, do_lower_case=False, remove_space=False, bos_token='sop', eos_token='eos', eop_token='eop', mask_token='[MASK]', gmask_token='[gMASK]', padding_side="right", **kwargs ) -> None: super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, padding_side=padding_side, **kwargs ) self.do_lower_case = do_lower_case self.remove_space = remove_space self.vocab_file = vocab_file self.bos_token = bos_token self.eos_token = eos_token self.eop_token = eop_token self.mask_token = mask_token self.gmask_token = gmask_token self.sp_tokenizer = SPTokenizer(vocab_file) """ Initialisation """ @property def eop_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been set. 
""" if self.eop_token is None: return None return self.convert_tokens_to_ids(self.eop_token) @property def gmask_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been set. """ if self.gmask_token is None: return None return self.convert_tokens_to_ids(self.gmask_token) @property def vocab_size(self): """ Returns vocab size """ return self.sp_tokenizer.num_tokens def get_vocab(self): """ Returns vocab as a dict """ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def preprocess_text(self, inputs): if self.remove_space: outputs = " ".join(inputs.strip().split()) else: outputs = inputs if self.do_lower_case: outputs = outputs.lower() return outputs def _tokenize(self, text, **kwargs): """ Returns a tokenized string. """ text = self.preprocess_text(text) seq = self.sp_tokenizer.tokenize(text) return seq def decode( self, token_ids: Union[List[int], List[List[int]]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, **kwargs ) -> str: if isinstance(token_ids[0], list): tokens = [] for single_token_ids in token_ids: if skip_special_tokens and self.pad_token_id in single_token_ids: # remove pad single_token_ids = list(filter((self.pad_token_id).__ne__, single_token_ids)) tokens.append(self.sp_tokenizer.decode(single_token_ids, not skip_special_tokens)) return (tokens) else: if skip_special_tokens and self.pad_token_id in token_ids: # remove pad token_ids = list(filter((self.pad_token_id).__ne__, token_ids)) return self.sp_tokenizer.decode(token_ids, not skip_special_tokens) def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. 
""" return self.sp_tokenizer[token] def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.sp_tokenizer[index] def save_vocabulary(self, save_directory, filename_prefix=None): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (`str`): The directory in which to save the vocabulary. filename_prefix (`str`, *optional*): An optional prefix to add to the named of the saved files. Returns: `Tuple(str)`: Paths to the files saved. """ if os.path.isdir(save_directory): vocab_file = os.path.join( save_directory, self.vocab_files_names["vocab_file"] ) else: vocab_file = save_directory with open(self.vocab_file, 'rb') as fin: proto_str = fin.read() with open(vocab_file, "wb") as writer: writer.write(proto_str) return (vocab_file,) # def num_special_tokens_to_add(self, pair: bool = False) -> int: # """ # Returns the number of added tokens when encoding a sequence with special tokens. # # # # This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put # this inside your training loop. # # # # Args: # pair (`bool`, *optional*, defaults to `False`): # Whether the number of added tokens should be computed in the case of a sequence pair or a single # sequence. # # Returns: # `int`: Number of special tokens added to sequences. # """ # # token_ids_0 = [] # # token_ids_1 = [] # # return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) # return 2 def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. 
A BERT sequence has the following format: - single sequence: `[CLS] X [SEP]` - pair of sequences: `[CLS] A [SEP] B [SEP]` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ mask_id = self.sp_tokenizer[self.mask_token] gmask_id = self.sp_tokenizer[self.gmask_token] eos_id = self.sp_tokenizer[self.eos_token] bos_id = self.sp_tokenizer[self.bos_token] eop_id = self.sp_tokenizer[self.eop_token] if mask_id not in token_ids_0 and gmask_id not in token_ids_0: token_ids_0 += [gmask_id] if token_ids_0[-1] != mask_id and token_ids_0[-1] != gmask_id: token_ids_0 += [eos_id] token_ids_0 += [bos_id] if token_ids_1 is not None: token_ids_0 += token_ids_1 + [eop_id] return token_ids_0 ================================================ FILE: src/resources/models/chatglm-6B/tokenizer_config.json ================================================ { "name_or_path": "THUDM/chatglm-6b", "bos_token": "", "eop_token": "", "eos_token": "", "gmask_token": "[gMASK]", "mask_token": "[MASK]", "pad_token": "", "unk_token": "", "remove_space": false, "do_lower_case": false, "tokenizer_class": "ChatGLMTokenizer", "auto_map": { "AutoTokenizer": [ "tokenization_chatglm.ChatGLMTokenizer", null ] } } ================================================ FILE: src/resources/models/chatglm2-6B/config.json ================================================ { "model_type": "chatglm", "architectures": [ "ChatGLMModel" ], "auto_map": { "AutoConfig": "configuration_chatglm.ChatGLMConfig", "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration" }, "add_bias_linear": false, "add_qkv_bias": true, "apply_query_key_layer_scaling": true, "apply_residual_connection_post_layernorm": false, "attention_dropout": 0.0, 
"attention_softmax_in_fp32": true, "bias_dropout_fusion": true, "ffn_hidden_size": 13696, "fp32_residual_connection": false, "hidden_dropout": 0.0, "hidden_size": 4096, "kv_channels": 128, "layernorm_epsilon": 1e-05, "multi_query_attention": true, "multi_query_group_num": 2, "num_attention_heads": 32, "num_layers": 28, "original_rope": true, "padded_vocab_size": 65024, "post_layer_norm": true, "rmsnorm": true, "seq_length": 32768, "use_cache": true, "torch_dtype": "float16", "transformers_version": "4.27.1", "tie_word_embeddings": false, "eos_token_id": 2, "pad_token_id": 2 } ================================================ FILE: src/resources/models/chatglm2-6B/configuration_chatglm.py ================================================ from transformers import PretrainedConfig class ChatGLMConfig(PretrainedConfig): def __init__( self, num_layers=28, padded_vocab_size=65024, hidden_size=4096, ffn_hidden_size=13696, kv_channels=128, num_attention_heads=32, seq_length=2048, hidden_dropout=0.0, attention_dropout=0.0, layernorm_epsilon=1e-5, rmsnorm=True, apply_residual_connection_post_layernorm=False, post_layer_norm=True, add_bias_linear=False, add_qkv_bias=False, interleaved_qkv=False, bias_dropout_fusion=True, multi_query_attention=False, multi_query_group_num=1, apply_query_key_layer_scaling=True, attention_softmax_in_fp32=True, fp32_residual_connection=False, quantization_bit=0, **kwargs ): self.num_layers = num_layers self.padded_vocab_size = padded_vocab_size self.hidden_size = hidden_size self.ffn_hidden_size = ffn_hidden_size self.kv_channels = kv_channels self.num_attention_heads = num_attention_heads self.seq_length = seq_length self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.layernorm_epsilon = layernorm_epsilon self.rmsnorm = rmsnorm self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm self.post_layer_norm = post_layer_norm self.add_bias_linear = add_bias_linear self.add_qkv_bias = 
add_qkv_bias self.bias_dropout_fusion = bias_dropout_fusion self.multi_query_attention = multi_query_attention self.multi_query_group_num = multi_query_group_num self.apply_query_key_layer_scaling = apply_query_key_layer_scaling self.attention_softmax_in_fp32 = attention_softmax_in_fp32 self.fp32_residual_connection = fp32_residual_connection self.quantization_bit = quantization_bit super().__init__(**kwargs) ================================================ FILE: src/resources/models/chatglm2-6B/modeling_chatglm.py ================================================ """ PyTorch ChatGLM model. """ import math import copy import warnings import re import sys import torch import torch.utils.checkpoint import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss, LayerNorm from torch.nn.utils import skip_init from typing import Optional, Tuple, Union, List, Callable, Dict, Any from transformers.modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import logging from transformers.generation.logits_process import LogitsProcessor from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput from .configuration_chatglm import ChatGLMConfig # flags required to enable jit fusion kernels if sys.platform != 'darwin': torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) torch._C._jit_override_can_fuse_on_cpu(True) torch._C._jit_override_can_fuse_on_gpu(True) logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM2-6B" _CONFIG_FOR_DOC = "ChatGLM6BConfig" CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ "THUDM/chatglm2-6b", # See all ChatGLM models at https://huggingface.co/models?filter=chatglm ] def default_init(cls, *args, **kwargs): return cls(*args, **kwargs) class InvalidScoreLogitsProcessor(LogitsProcessor): def __call__(self, input_ids: 
def split_tensor_along_last_dim(
        tensor: torch.Tensor,
        num_partitions: int,
        contiguous_split_chunks: bool = False,
) -> List[torch.Tensor]:
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A list of Tensors
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = tensor.size()[last_dim] // num_partitions
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)
    return tensor_list


class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) cache builder."""

    def __init__(self, dim, original_impl=False, device=None, dtype=None):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=dtype) / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.dim = dim
        self.original_impl = original_impl

    def forward_impl(
            self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
    ):
        """Enhanced Transformer with Rotary Position Embedding.

        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
        transformers/rope/__init__.py. MIT License:
        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
        """
        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
        theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem))

        # Create position indexes `[0, 1, ..., seq_len - 1]`
        seq_idx = torch.arange(seq_len, dtype=dtype, device=device)

        # Outer product of positions and frequencies -> per-position angles.
        idx_theta = torch.outer(seq_idx, theta).float()

        # Cache holds (cos, sin) pairs: shape [seq_len, n_elem/2, 2].
        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

        # This is to mimic the behaviour of complex32, else we will get different results.
        if dtype in (torch.float16, torch.bfloat16, torch.int8):
            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
        return cache

    def forward(self, max_seq_len, offset=0):
        # NOTE(review): `offset` is accepted but unused, matching the original.
        return self.forward_impl(
            max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
        )


@torch.jit.script
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
    """Rotate query/key features by the cached (cos, sin) angles.

    x: [sq, b, np, hn]; only the first `rot_dim` features are rotated, the
    remainder passes through unchanged.
    """
    sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
    rot_dim = rope_cache.shape[-2] * 2
    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
    # Truncate the cache to support variable sequence lengths.
    rope_cache = rope_cache[:sq]
    xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
    rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
    # Complex rotation expressed on (real, imag) pairs.
    x_out2 = torch.stack(
        [
            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
        ],
        -1,
    )
    x_out2 = x_out2.flatten(3)
    return torch.cat((x_out2, x_pass), dim=-1)


class RMSNorm(torch.nn.Module):
    """Root-mean-square LayerNorm (no mean subtraction, no bias)."""

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor):
        input_dtype = hidden_states.dtype
        # Variance is computed in fp32 for numerical stability.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
        return (self.weight * hidden_states).to(input_dtype)


class CoreAttention(torch.nn.Module):
    """Scaled dot-product attention core operating on [sq, b, np, hn] tensors."""

    def __init__(self, config: "ChatGLMConfig", layer_number):
        super(CoreAttention, self).__init__()

        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)

        projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_partition = projection_size
        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            # Scale by layer number; undone after the matmul via self.coeff.
            coeff = self.layer_number
            self.norm_factor *= coeff
        self.coeff = coeff

        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        pytorch_major_version = int(torch.__version__.split('.')[0])
        if pytorch_major_version >= 2:
            # Fast path: fused scaled_dot_product_attention expects [b, np, s, hn].
            query_layer, key_layer, value_layer = [
                k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]
            ]
            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
                context_layer = torch.nn.functional.scaled_dot_product_attention(
                    query_layer, key_layer, value_layer, is_causal=True
                )
            else:
                if attention_mask is not None and attention_mask.dtype == torch.bool:
                    # SDPA's boolean mask marks KEPT positions; ours marks masked ones.
                    attention_mask = ~attention_mask
                context_layer = torch.nn.functional.scaled_dot_product_attention(
                    query_layer, key_layer, value_layer, attention_mask
                )
            context_layer = context_layer.permute(2, 0, 1, 3)
            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
            context_layer = context_layer.reshape(*new_context_layer_shape)
        else:
            # Raw attention scores

            # [b, np, sq, sk]
            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))

            # [sq, b, np, hn] -> [sq, b * np, hn]
            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
            # [sk, b, np, hn] -> [sk, b * np, hn]
            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)

            # preallocting input tensor: [b * np, sq, sk]
            matmul_input_buffer = torch.empty(
                output_size[0] * output_size[1], output_size[2], output_size[3],
                dtype=query_layer.dtype, device=query_layer.device
            )

            # Raw attention scores. [b * np, sq, sk]
            matmul_result = torch.baddbmm(
                matmul_input_buffer,
                query_layer.transpose(0, 1),  # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
                beta=0.0,
                alpha=(1.0 / self.norm_factor),
            )

            # change view to [b, np, sq, sk]
            attention_scores = matmul_result.view(*output_size)

            # ===========================
            # Attention probs and dropout
            # ===========================

            # attention scores and attention mask [b, np, sq, sk]
            if self.attention_softmax_in_fp32:
                attention_scores = attention_scores.float()
            if self.coeff is not None:
                attention_scores = attention_scores * self.coeff
            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
                # Self-attention without an explicit mask: build a causal one.
                attention_mask = torch.ones(
                    output_size[0], 1, output_size[2], output_size[3],
                    device=attention_scores.device, dtype=torch.bool
                )
                attention_mask.tril_()
                attention_mask = ~attention_mask
            if attention_mask is not None:
                if attention_mask.dtype == torch.bool:
                    attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
                else:
                    # FIX: a float mask is additive (0 for keep, -inf/-1e4 for
                    # masked). Assigning it over the scores discarded all
                    # attention logits.
                    attention_scores = attention_scores + attention_mask
            attention_probs = F.softmax(attention_scores, dim=-1)
            attention_probs = attention_probs.type_as(value_layer)

            # This is actually dropping out entire tokens to attend to, which might
            # seem a bit unusual, but is taken from the original Transformer paper.
            attention_probs = self.attention_dropout(attention_probs)

            # =========================
            # Context layer. [sq, b, hp]
            # =========================

            # value_layer -> context layer.
            # [sk, b, np, hn] --> [b, np, sq, hn]

            # context layer shape: [b, np, sq, hn]
            output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
            # change view [sk, b * np, hn]
            value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
            # change view [b * np, sq, sk]
            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
            # matmul: [b * np, sq, hn]
            context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
            # change view [b, np, sq, hn]
            context_layer = context_layer.view(*output_size)
            # [b, np, sq, hn] --> [sq, b, np, hn]
            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
            # [sq, b, np, hn] --> [sq, b, hp]
            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
            context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer
        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        self.multi_query_attention = config.multi_query_attention
        self.qkv_hidden_size = 3 * self.projection_size
        if self.multi_query_attention:
            # Multi-query attention: full-width queries, but only
            # ``multi_query_group_num`` shared key/value heads.
            self.num_multi_query_groups_per_partition = config.multi_query_group_num
            self.qkv_hidden_size = (
                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
            )
        # Fused QKV projection.
        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
                                         bias=config.add_bias_linear or config.add_qkv_bias,
                                         device=device, **_config_to_kwargs(config)
                                         )

        self.core_attention = CoreAttention(config, self.layer_number)

        # Output.
        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
                               device=device, **_config_to_kwargs(config)
                               )

    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
        # Pre-allocate an (uninitialized) KV-cache buffer of shape
        # [max_seq_len, b, kv-heads, hn].
        if self.multi_query_attention:
            num_attention_heads = self.num_multi_query_groups_per_partition
        else:
            num_attention_heads = self.num_attention_heads_per_partition
        return torch.empty(
            inference_max_sequence_len,
            batch_size,
            num_attention_heads,
            self.hidden_size_per_attention_head,
            dtype=dtype,
            device=device,
        )

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
    ):
        # hidden_states: [sq, b, h]
        # Returns (output [sq, b, h], kv_cache) where kv_cache is the updated
        # (key, value) pair when use_cache is True, else None.

        # =================================================
        # Pre-allocate memory for key-values for inference.
        # =================================================
        # =====================
        # Query, Key, and Value
        # =====================

        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
        mixed_x_layer = self.query_key_value(hidden_states)

        if self.multi_query_attention:
            # Split the fused projection into full-width Q and narrow K/V.
            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
                [
                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                ],
                dim=-1,
            )
            query_layer = query_layer.view(
                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            key_layer = key_layer.view(
                key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.view(
                value_layer.size()[:-1]
                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
        else:
            new_tensor_shape = mixed_x_layer.size()[:-1] + \
                               (self.num_attention_heads_per_partition,
                                3 * self.hidden_size_per_attention_head)
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # apply relative positional encoding (rotary embedding)
        if rotary_pos_emb is not None:
            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

        # adjust key and value for inference: prepend the cached keys/values
        # along the sequence dimension and return the grown cache.
        if use_cache:
            if kv_cache is not None:
                cache_k, cache_v = kv_cache
                key_layer = torch.cat((cache_k, key_layer), dim=0)
                value_layer = torch.cat((cache_v, value_layer), dim=0)
            kv_cache = (key_layer, value_layer)
        else:
            kv_cache = None

        if self.multi_query_attention:
            # Broadcast each shared KV group across its query heads so the
            # core attention sees the full head count.
            key_layer = key_layer.unsqueeze(-2)
            key_layer = key_layer.expand(
                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
            )
            key_layer = key_layer.contiguous().view(
                key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.unsqueeze(-2)
            value_layer = value_layer.expand(
                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
            )
            value_layer = value_layer.contiguous().view(
                value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )

        # ==================================
        # core attention computation
        # ==================================

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        # =================
        # Output. [sq, b, h]
        # =================

        output = self.dense(context_layer)

        return output, kv_cache


def _config_to_kwargs(args):
    # Shared nn.Linear/nn.Embedding construction kwargs derived from config.
    common_kwargs = {
        "dtype": args.torch_dtype,
    }
    return common_kwargs


class MLP(torch.nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    """

    def __init__(self, config: ChatGLMConfig, device=None):
        super(MLP, self).__init__()

        self.add_bias = config.add_bias_linear

        # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
        self.dense_h_to_4h = nn.Linear(
            config.hidden_size,
            config.ffn_hidden_size * 2,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

        def swiglu(x):
            # SwiGLU: split the doubled projection in two and gate one half
            # with SiLU of the other.
            x = torch.chunk(x, 2, dim=-1)
            return F.silu(x[0]) * x[1]

        self.activation_func = swiglu

        # Project back to h.
        self.dense_4h_to_h = nn.Linear(
            config.ffn_hidden_size,
            config.hidden_size,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

    def forward(self, hidden_states):
        # [s, b, 4hp]
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        intermediate_parallel = self.activation_func(intermediate_parallel)
        # [s, b, h]
        output = self.dense_4h_to_h(intermediate_parallel)
        return output


class GLMBlock(torch.nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(GLMBlock, self).__init__()
        self.layer_number = layer_number

        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm

        self.fp32_residual_connection = config.fp32_residual_connection

        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
        # Layernorm on the input data.
        self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                             dtype=config.torch_dtype)

        # Self attention.
        self.self_attention = SelfAttention(config, layer_number, device=device)
        self.hidden_dropout = config.hidden_dropout

        # Layernorm on the attention output
        self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                      dtype=config.torch_dtype)

        # MLP
        self.mlp = MLP(config, device=device)

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
    ):
        # hidden_states: [s, b, h]
        # Returns (hidden_states, kv_cache) with the layer's updated KV cache.

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        # Self attention.
        attention_output, kv_cache = self.self_attention(
            layernorm_output,
            attention_mask,
            rotary_pos_emb,
            kv_cache=kv_cache,
            use_cache=use_cache
        )

        # Residual connection: either around the pre-norm output or around the
        # raw block input, depending on config.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
        layernorm_input = residual + layernorm_input

        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)

        # MLP.
        mlp_output = self.mlp(layernorm_output)

        # Second residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
        output = residual + output

        return output, kv_cache


class GLMTransformer(torch.nn.Module):
    """Transformer class: the stack of GLMBlock layers plus the final norm."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(GLMTransformer, self).__init__()

        self.fp32_residual_connection = config.fp32_residual_connection
        self.post_layer_norm = config.post_layer_norm

        # Number of layers.
        self.num_layers = config.num_layers

        # Transformer layers.
        def build_layer(layer_number):
            # Layer numbers are 1-based (used for query/key layer scaling).
            return GLMBlock(config, layer_number, device=device)

        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])

        if self.post_layer_norm:
            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
            # Final layer norm before output.
            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                 dtype=config.torch_dtype)

        self.gradient_checkpointing = False

    def _get_layer(self, layer_number):
        # 0-based index into the layer stack.
        return self.layers[layer_number]

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
            use_cache: Optional[bool] = True,
            output_hidden_states: Optional[bool] = False,
    ):
        """Run the layer stack.

        Returns (hidden_states, presents, all_hidden_states, all_self_attentions);
        attentions are never collected here, so the last item is always None.
        """
        if not kv_caches:
            kv_caches = [None for _ in range(self.num_layers)]
        presents = () if use_cache else None
        all_self_attentions = None
        all_hidden_states = () if output_hidden_states else None
        for index in range(self.num_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer = self._get_layer(index)
            if self.gradient_checkpointing and self.training:
                # Re-compute activations in the backward pass to save memory.
                hidden_states, kv_cache = torch.utils.checkpoint.checkpoint(
                    layer,
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_caches[index],
                    use_cache
                )
            else:
                hidden_states, kv_cache = layer(
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_cache=kv_caches[index],
                    use_cache=use_cache
                )
            if use_cache:
                presents = presents + (kv_cache,)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        # Final layer norm.
        if self.post_layer_norm:
            hidden_states = self.final_layernorm(hidden_states)

        return hidden_states, presents, all_hidden_states, all_self_attentions


class ChatGLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    is_parallelizable = False
    supports_gradient_checkpointing = True
    config_class = ChatGLMConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["GLMBlock"]

    def _init_weights(self, module: nn.Module):
        """Initialize the weights."""
        # Intentionally a no-op: weights come from the checkpoint.
        return

    def get_masks(self, input_ids, past_key_values, padding_mask=None):
        """Build the full [b, 1, sq, sk] boolean mask (True = masked out)."""
        batch_size, seq_length = input_ids.shape
        # Start from a causal (lower-triangular) mask over the new tokens.
        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
        full_attention_mask.tril_()
        past_length = 0
        if past_key_values:
            # KV cache is sequence-first: [sk, b, ...].
            past_length = past_key_values[0][0].shape[0]
        if past_length:
            # Cached positions are always visible to the new tokens.
            full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
                                                        device=input_ids.device), full_attention_mask), dim=-1)
        if padding_mask is not None:
            full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
        if not past_length and padding_mask is not None:
            full_attention_mask -= padding_mask.unsqueeze(-1) - 1
        # Invert to "True means masked" and add the head dimension.
        full_attention_mask = (full_attention_mask < 0.5).bool()
        full_attention_mask.unsqueeze_(1)
        return full_attention_mask

    def get_position_ids(self, input_ids, device):
        # Plain 0..seq_length-1 positions repeated per batch row.
        batch_size, seq_length = input_ids.shape
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
        return position_ids

    def _set_gradient_checkpointing(self, module, value=False):
        # Propagate the flag to the model and its encoder stack.
        if isinstance(module, ChatGLMModel):
            module.gradient_checkpointing = value
            if hasattr(module, "encoder"):
                encoder = getattr(module, "encoder")
                encoder.gradient_checkpointing = value


class Embedding(torch.nn.Module):
    """Language model embeddings."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(Embedding, self).__init__()

        self.hidden_size = config.hidden_size
        # Word embeddings (parallel).
        self.word_embeddings = nn.Embedding(
            config.padded_vocab_size,
            self.hidden_size,
            dtype=config.torch_dtype,
            device=device
        )
        self.fp32_residual_connection = config.fp32_residual_connection

    def forward(self, input_ids):
        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings
        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
        embeddings = embeddings.transpose(0, 1).contiguous()
        # If the input flag for fp32 residual connection is set, convert for float.
        if self.fp32_residual_connection:
            embeddings = embeddings.float()
        return embeddings


class ChatGLMModel(ChatGLMPreTrainedModel):
    """Bare ChatGLM transformer: embedding + GLMTransformer encoder + output head."""

    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
        super().__init__(config)
        # ``skip_init`` builds modules without materializing weights (they are
        # expected to be loaded from a checkpoint afterwards).
        if empty_init:
            init_method = skip_init
        else:
            init_method = default_init
        init_kwargs = {}
        if device is not None:
            init_kwargs["device"] = device
        self.embedding = init_method(Embedding, config, **init_kwargs)

        # Rotary positional embeddings
        self.seq_length = config.seq_length
        rotary_dim = (
            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
        )

        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
                                              dtype=config.torch_dtype)
        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
        self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
                                        dtype=config.torch_dtype, **init_kwargs)
        self.gradient_checkpointing = False

    def get_input_embeddings(self):
        return self.embedding

    def forward(
            self,
            input_ids,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            full_attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ):
        """Encode ``input_ids`` and return hidden states (+ KV cache).

        Returns a ``BaseModelOutputWithPast`` when ``return_dict`` is true,
        otherwise a tuple of the non-None values.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.gradient_checkpointing and self.training:
            if use_cache:
                # Caching is incompatible with gradient checkpointing.
                # logger.warning_once(
                #     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                # )
                use_cache = False

        batch_size, seq_length = input_ids.shape

        if inputs_embeds is None:
            inputs_embeds = self.embedding(input_ids)

        if full_attention_mask is None:
            # Only build the full mask when padding is present or when
            # decoding with a cache and more than one new token.
            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)

        # Rotary positional embeddings
        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
        if position_ids is not None:
            rotary_pos_emb = rotary_pos_emb[position_ids]
        else:
            rotary_pos_emb = rotary_pos_emb[None, :seq_length]
        # [b, sq, ...] --> [sq, b, ...] to match the sequence-first layout.
        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()

        # Run encoder.
        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
            inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
            kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
        )

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def quantize(self, weight_bit_width: int):
        # In-place weight quantization of the encoder stack.
        from .quantization import quantize
        quantize(self.encoder, weight_bit_width)
        return self


class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
    """ChatGLM with a causal-LM head plus chat/stream-generation helpers."""

    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
        super().__init__(config)

        self.max_sequence_length = config.max_length
        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
        self.config = config
        self.quantized = False

        if self.config.quantization_bit:
            self.quantize(self.config.quantization_bit, empty_init=True)

    def _update_model_kwargs_for_generation(
            self,
            outputs: ModelOutput,
            model_kwargs: Dict[str, Any],
            is_encoder_decoder: bool = False,
            standardize_cache_format: bool = False,
    ) -> Dict[str, Any]:
        """Advance cache, attention mask and position ids by one step."""
        # update past_key_values
        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
            outputs, standardize_cache_format=standardize_cache_format
        )

        # update attention mask
        if "attention_mask" in model_kwargs:
            attention_mask = model_kwargs["attention_mask"]
            model_kwargs["attention_mask"] = torch.cat(
                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
            )

        # update position ids
        if "position_ids" in model_kwargs:
            position_ids = model_kwargs["position_ids"]
            new_position_id = position_ids[..., -1:].clone()
            new_position_id += 1
            model_kwargs["position_ids"] = torch.cat(
                [position_ids, new_position_id], dim=-1
            )

        model_kwargs["is_first_forward"] = False
        return model_kwargs

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            full_attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
    ):
        """LM forward pass; computes a shifted cross-entropy loss when ``labels`` is given."""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            full_attention_mask=full_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        if return_last_logit:
            # Sequence-first layout: keep only the final position's state.
            hidden_states = hidden_states[-1:]
        lm_logits = self.transformer.output_layer(hidden_states)
        # [sq, b, v] --> [b, sq, v]
        lm_logits = lm_logits.transpose(0, 1).contiguous()

        loss = None
        if labels is not None:
            lm_logits = lm_logits.to(torch.float32)

            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens.  Label positions equal to pad_token_id are
            # ignored (this file uses the pad id, not -100, as ignore_index).
            loss_fct = CrossEntropyLoss(ignore_index=self.config.pad_token_id)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            lm_logits = lm_logits.to(hidden_states.dtype)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def prepare_inputs_for_generation(
            self,
            input_ids: torch.LongTensor,
            past_key_values: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            is_first_forward: bool = True,
            **kwargs
    ) -> dict:
        # only last token for input_ids if past is not None
        if position_ids is None:
            position_ids = self.get_position_ids(input_ids, device=input_ids.device)
        if not is_first_forward:
            position_ids = position_ids[..., -1:]
            input_ids = input_ids[:, -1:]
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "return_last_logit": True
        }

    @staticmethod
    def _reorder_cache(
            past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        """
        # Cache layout is [sk, b, ...]; the batch/beam axis is dim 1.
        return tuple(
            (
                layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
                layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
            )
            for layer_past in past
        )

    def process_response(self, response):
        # Post-process a decoded reply (trim and substitute the training-time
        # placeholder).
        response = response.strip()
        response = response.replace("[[训练时间]]", "2023年")
        return response

    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
        # Render the full multi-round chat prompt and tokenize it.
        prompt = ""
        for i, (old_query, response) in enumerate(history):
            prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response)
        prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)
        inputs = tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(self.device)
        return inputs

    def build_stream_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
        # Incremental variant of build_inputs: when a history exists only the
        # newest round is tokenized (the rest lives in the KV cache).
        if history:
            prompt = "\n\n[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)
            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
            # Drop the leading token of the incremental chunk.
            input_ids = input_ids[1:]
            inputs = tokenizer.batch_encode_plus([(input_ids, None)], return_tensors="pt", add_special_tokens=False)
        else:
            prompt = "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)
            inputs = tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(self.device)
        return inputs

    @torch.no_grad()
    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1,
             do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, **kwargs):
        """One blocking chat round; returns (response, updated history)."""
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        inputs = self.build_inputs(tokenizer, query, history=history)
        outputs = self.generate(**inputs, **gen_kwargs)
        # Strip the prompt tokens; keep only the newly generated tail.
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
        response = tokenizer.decode(outputs)
        response = self.process_response(response)
        history = history + [(query, response)]
        return response, history

    @torch.no_grad()
    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values=None,
                    max_length: int = 2048, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
                    return_past_key_values=False, **kwargs):
        """Streaming chat: yields (response, history[, past_key_values]) as tokens arrive."""
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        if past_key_values is None and not return_past_key_values:
            inputs = self.build_inputs(tokenizer, query, history=history)
        else:
            inputs = self.build_stream_inputs(tokenizer, query, history=history)
        if past_key_values is not None:
            # Offset positions and extend the mask over the cached prefix.
            past_length = past_key_values[0][0].shape[0]
            inputs.position_ids += past_length
            attention_mask = inputs.attention_mask
            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
            inputs['attention_mask'] = attention_mask
        for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
                                            return_past_key_values=return_past_key_values, **gen_kwargs):
            if return_past_key_values:
                outputs, past_key_values = outputs
            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
            response = tokenizer.decode(outputs)
            response = self.process_response(response)
            new_history = history + [(query, response)]
            if return_past_key_values:
                yield response, new_history, past_key_values
            else:
                yield response, new_history

    @torch.no_grad()
    def stream_generate(
            self,
            input_ids,
            generation_config: Optional[GenerationConfig] = None,
            logits_processor: Optional[LogitsProcessorList] = None,
            stopping_criteria: Optional[StoppingCriteriaList] = None,
            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
            return_past_key_values=False,
            **kwargs,
    ):
        """Token-by-token sampling loop that yields the growing ``input_ids``.

        A simplified copy of transformers' ``GenerationMixin.sample`` that
        yields after every step instead of returning once at the end.
        """
        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]

        if generation_config is None:
            generation_config = self.generation_config
        generation_config = copy.deepcopy(generation_config)
        model_kwargs = generation_config.update(**kwargs)
        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id

        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]

        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
        if has_default_max_length and generation_config.max_new_tokens is None:
            warnings.warn(
                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
                " recommend using `max_new_tokens` to control the maximum length of the generation.",
                UserWarning,
            )
        elif generation_config.max_new_tokens is not None:
            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
            if not has_default_max_length:
                logger.warn(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
                    UserWarning,
                )

        if input_ids_seq_length >= generation_config.max_length:
            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
            logger.warning(
                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                " increasing `max_new_tokens`."
            )

        # 2. Set generation parameters if not already defined
        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

        logits_processor = self._get_logits_processor(
            generation_config=generation_config,
            input_ids_seq_length=input_ids_seq_length,
            encoder_input_ids=input_ids,
            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
            logits_processor=logits_processor,
        )

        stopping_criteria = self._get_stopping_criteria(
            generation_config=generation_config, stopping_criteria=stopping_criteria
        )
        logits_warper = self._get_logits_warper(generation_config)

        # 1 = still generating, 0 = hit an EOS token.
        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
        scores = None
        while True:
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            # forward pass to get next token
            outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
            )

            next_token_logits = outputs.logits[:, -1, :]

            # pre-process distribution
            next_token_scores = logits_processor(input_ids, next_token_logits)
            next_token_scores = logits_warper(input_ids, next_token_scores)

            # sample
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            if generation_config.do_sample:
                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
            else:
                next_tokens = torch.argmax(probs, dim=-1)

            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )
            # Mark a sequence finished when it emitted any of the EOS ids.
            unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
            if return_past_key_values:
                yield input_ids, outputs.past_key_values
            else:
                yield input_ids
            # stop when each sentence is finished, or if we exceed the maximum length
            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                break
def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
    """In-place weight quantization of the transformer encoder to `bits` (4 or 8).

    Returns self so calls can be chained; `bits == 0` is a no-op (returns None).
    A repeated call is skipped once `self.quantized` has been set.
    """
    if bits == 0:
        return
    from .quantization import quantize
    if self.quantized:
        logger.info("Already quantized.")
        return self
    self.quantized = True
    self.config.quantization_bit = bits
    # Only the encoder stack is quantized; embeddings and the LM head keep their dtype.
    self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, **kwargs)
    return self


================================================
FILE: src/resources/models/chatglm2-6B/quantization.py
================================================
from torch.nn import Linear
from torch.nn.parameter import Parameter

import bz2
import torch
import base64
import ctypes
from transformers.utils import logging

from typing import List
from functools import partial

logger = logging.get_logger(__name__)

# CUDA kernels come from cpm_kernels; if unavailable, `kernels` is set to None in the
# except branch below and only the pure-torch 8-bit paths keep working.
try:
    from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up

    class Kernel:
        """Lazily-compiled CUDA module exposing each listed kernel as a callable attribute."""

        def __init__(self, code: bytes, function_names: List[str]):
            self.code = code
            self._function_names = function_names
            self._cmodule = LazyKernelCModule(self.code)
            for name in self._function_names:
                # e.g. self.int4WeightCompression becomes a KernelFunction.
                setattr(self, name, KernelFunction(self._cmodule, name))

    # bz2+base64 encoded fatbin with the quantization kernels (literal continues on the next line).
    quantization_code = 
"$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2
NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5X
K9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI
8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" kernels = Kernel( bz2.decompress(base64.b64decode(quantization_code)), [ "int4WeightCompression", "int4WeightExtractionFloat", "int4WeightExtractionHalf", "int8WeightExtractionFloat", "int8WeightExtractionHalf", ], ) except Exception as exception: kernels = None logger.warning("Failed to load cpm_kernels:" + str(exception)) class W8A16Linear(torch.autograd.Function): @staticmethod def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): ctx.inp_shape = inp.size() ctx.weight_bit_width = weight_bit_width out_features = quant_w.size(0) inp = inp.contiguous().view(-1, inp.size(-1)) weight = 
# (continuation of `weight = ` from the previous line, inside W8A16Linear.forward)
        extract_weight_to_half(quant_w, scale_w, weight_bit_width)
        ctx.weight_shape = weight.size()
        output = inp.mm(weight.t())
        ctx.save_for_backward(inp, quant_w, scale_w)
        return output.view(*(ctx.inp_shape[:-1] + (out_features,)))

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        """Gradients of y = x @ dequant(W).T for the input and the (dequantized) weight."""
        inp, quant_w, scale_w = ctx.saved_tensors
        # Dequantize again instead of saving the fp weight: trades compute for memory.
        weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
        grad_output = grad_output.contiguous().view(-1, weight.size(0))
        grad_input = grad_output.mm(weight)
        grad_weight = grad_output.t().mm(inp)
        # No gradients for quant_w, scale_w (frozen) — hence the two trailing None.
        return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None


def compress_int4_weight(weight: torch.Tensor):  # (n, m)
    """Pack int8-stored 4-bit values into half-width storage (two nibbles per byte). CUDA only."""
    with torch.cuda.device(weight.device):
        n, m = weight.size(0), weight.size(1)
        assert m % 2 == 0
        m = m // 2
        out = torch.empty(n, m, dtype=torch.int8, device="cuda")
        stream = torch.cuda.current_stream()

        # One CUDA block per weight row.
        gridDim = (n, 1, 1)
        blockDim = (min(round_up(m, 32), 1024), 1, 1)

        kernels.int4WeightCompression(
            gridDim,
            blockDim,
            0,
            stream,
            [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
        )
        return out


def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
    """Dequantize int8-stored weights back to scale_list.dtype using per-row scales.

    The 8-bit path is pure torch (works on CPU); the 4-bit path requires the CUDA kernels.
    """
    assert scale_list.dtype in [torch.half, torch.bfloat16]
    assert weight.dtype in [torch.int8]
    if source_bit_width == 8:
        return weight.to(scale_list.dtype) * scale_list[:, None]
    elif source_bit_width == 4:
        func = (
            kernels.int4WeightExtractionHalf
            if scale_list.dtype == torch.half
            # NOTE(review): "int4WeightExtractionBFloat16" is NOT in the function_names list used
            # to build the Kernel above, so this branch raises AttributeError for bfloat16 scales —
            # confirm against upstream and either register the kernel or reject bfloat16 explicitly.
            else kernels.int4WeightExtractionBFloat16
        )
    else:
        assert False, "Unsupported bit-width"

    with torch.cuda.device(weight.device):
        n, m = weight.size(0), weight.size(1)
        # 4-bit storage expands 2x on extraction (8 // source_bit_width).
        out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda")
        stream = torch.cuda.current_stream()

        gridDim = (n, 1, 1)
        blockDim = (min(round_up(m, 32), 1024), 1, 1)

        func(
            gridDim,
            blockDim,
            0,
            stream,
            [
                ctypes.c_void_p(weight.data_ptr()),
                ctypes.c_void_p(scale_list.data_ptr()),
                ctypes.c_void_p(out.data_ptr()),
                ctypes.c_int32(n),
                ctypes.c_int32(m),
            ],
        )
        return out


class QuantizedLinear(torch.nn.Module):
    """Linear layer holding int8/int4 weights plus per-row scales; dequantizes on the fly."""

    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args, **kwargs):
        super().__init__()
        self.weight_bit_width = weight_bit_width

        # NOTE(review): weight.shape is read before the `weight is None` test below, so passing
        # weight=None crashes with AttributeError here — the None branch appears unreachable.
        shape = weight.shape

        if weight is None or empty_init:
            # Reserve storage only; real values are expected to come from a checkpoint load.
            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
        else:
            # Symmetric per-row quantization: the row abs-max maps to the signed int range.
            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
            if weight_bit_width == 4:
                self.weight = compress_int4_weight(self.weight)

        self.weight = Parameter(self.weight.to(device), requires_grad=False)
        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None

    def forward(self, input):
        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
        if self.bias is not None:
            output = output + self.bias
        return output


def quantize(model, weight_bit_width, empty_init=False, device=None):
    """Replace fp16 linear with quantized linear"""
    # Swap the four projections of each transformer layer for QuantizedLinear modules.
    for layer in model.layers:
        layer.self_attention.query_key_value = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
            bias=layer.self_attention.query_key_value.bias,
            dtype=layer.self_attention.query_key_value.weight.dtype,
            device=layer.self_attention.query_key_value.weight.device if device is None else device,
            empty_init=empty_init
        )
        layer.self_attention.dense = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
            bias=layer.self_attention.dense.bias,
            dtype=layer.self_attention.dense.weight.dtype,
            # (call continues on the next line)
# (continuation of the QuantizedLinear(...) call for layer.self_attention.dense)
            device=layer.self_attention.dense.weight.device if device is None else device,
            empty_init=empty_init
        )
        layer.mlp.dense_h_to_4h = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
            bias=layer.mlp.dense_h_to_4h.bias,
            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
            empty_init=empty_init
        )
        layer.mlp.dense_4h_to_h = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
            bias=layer.mlp.dense_4h_to_h.bias,
            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
            empty_init=empty_init
        )

    return model


================================================
FILE: src/resources/models/chatglm2-6B/tokenization_chatglm.py
================================================
import os
import torch
from typing import List, Optional, Union, Dict
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding


class SPTokenizer:
    """Thin wrapper around a SentencePiece model that appends ChatGLM's special tokens."""

    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.eos_id()  # pad deliberately shares the EOS id
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        # Special tokens are appended past the SentencePiece vocab with sequential ids.
        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"]
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        """Split text into SentencePiece piece strings (not ids)."""
        return self.sp_model.EncodeAsPieces(s)
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: assert type(s) is str t = self.sp_model.encode(s) if bos: t = [self.bos_id] + t if eos: t = t + [self.eos_id] return t def decode(self, t: List[int]) -> str: return self.sp_model.decode(t) def decode_tokens(self, tokens: List[str]) -> str: text = self.sp_model.DecodePieces(tokens) return text def convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ if token in self.special_tokens: return self.special_tokens[token] return self.sp_model.PieceToId(token) def convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index in self.index_special_tokens: return "" return self.sp_model.IdToPiece(index) class ChatGLMTokenizer(PreTrainedTokenizer): vocab_files_names = {"vocab_file": "tokenizer.model"} model_input_names = ["input_ids", "attention_mask", "position_ids"] def __init__(self, vocab_file, padding_side="right", **kwargs): super().__init__(padding_side=padding_side, **kwargs) self.name = "GLMTokenizer" self.vocab_file = vocab_file self.tokenizer = SPTokenizer(vocab_file) self.special_tokens = { "": self.tokenizer.bos_id, "": self.tokenizer.eos_id, "": self.tokenizer.pad_id } def get_command(self, token): if token in self.special_tokens: return self.special_tokens[token] assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" return self.tokenizer.special_tokens[token] @property def pad_token(self) -> str: return "" @property def pad_token_id(self): return self.get_command("") @property def vocab_size(self): return self.tokenizer.n_words def get_vocab(self): """ Returns vocab as a dict """ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text, **kwargs): return self.tokenizer.tokenize(text) def _convert_token_to_id(self, token): """ Converts a token (str) in an id using 
the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join piece strings to text, decoding special tokens verbatim."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                # Flush the pending ordinary pieces, then append the special token as-is.
                out_string += self.tokenizer.decode_tokens(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.tokenizer.decode_tokens(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        # The SentencePiece model file is copied byte-for-byte.
        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        """Prompt prefix every ChatGLM2 input starts with: [gMASK] followed by sop."""
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens.
A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            # Repo customization: pairs are joined with "eop" instead of the original eos command.
            # token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")]
            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("eop")]
        return token_ids_0

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        # assert self.padding_side == "left"
        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round the target length up to the requested multiple.
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if self.padding_side == "left":
                # Pad on the left: mask 0s and position 0s precede the real tokens.
                if "attention_mask" in encoded_inputs:
                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                if "position_ids" in encoded_inputs:
                    encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
            else:
                # Pad on the right.
                if "attention_mask" in encoded_inputs:
                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                if "position_ids" in encoded_inputs:
                    encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference

        return encoded_inputs


================================================
FILE: src/resources/models/chatglm2-6B/tokenizer_config.json
================================================
{
  "name_or_path": "THUDM/chatglm-6b",
  "remove_space": false,
  "do_lower_case": false,
  "tokenizer_class": "ChatGLMTokenizer",
  "auto_map": {
"AutoTokenizer": [ "tokenization_chatglm.ChatGLMTokenizer", null ] } } ================================================ FILE: src/resources/models/chatglm3-6B/config.json ================================================ { "_name_or_path": "THUDM/chatglm3-6b", "model_type": "chatglm", "architectures": [ "ChatGLMModel" ], "auto_map": { "AutoConfig": "configuration_chatglm.ChatGLMConfig", "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" }, "add_bias_linear": false, "add_qkv_bias": true, "apply_query_key_layer_scaling": true, "apply_residual_connection_post_layernorm": false, "attention_dropout": 0.0, "attention_softmax_in_fp32": true, "bias_dropout_fusion": true, "ffn_hidden_size": 13696, "fp32_residual_connection": false, "hidden_dropout": 0.0, "hidden_size": 4096, "kv_channels": 128, "layernorm_epsilon": 1e-05, "multi_query_attention": true, "multi_query_group_num": 2, "num_attention_heads": 32, "num_layers": 28, "original_rope": true, "padded_vocab_size": 65024, "post_layer_norm": true, "rmsnorm": true, "seq_length": 8192, "use_cache": true, "torch_dtype": "float16", "transformers_version": "4.30.2", "tie_word_embeddings": false, "eos_token_id": 2, "pad_token_id": -100 } ================================================ FILE: src/resources/models/chatglm3-6B/configuration_chatglm.py ================================================ from transformers import PretrainedConfig class ChatGLMConfig(PretrainedConfig): model_type = "chatglm" def __init__( self, num_layers=28, padded_vocab_size=65024, hidden_size=4096, ffn_hidden_size=13696, kv_channels=128, num_attention_heads=32, seq_length=2048, hidden_dropout=0.0, classifier_dropout=None, attention_dropout=0.0, layernorm_epsilon=1e-5, rmsnorm=True, 
# (parameter list of ChatGLMConfig.__init__ continues)
        apply_residual_connection_post_layernorm=False,
        post_layer_norm=True,
        add_bias_linear=False,
        add_qkv_bias=False,
        bias_dropout_fusion=True,
        multi_query_attention=False,
        multi_query_group_num=1,
        apply_query_key_layer_scaling=True,
        attention_softmax_in_fp32=True,
        fp32_residual_connection=False,
        quantization_bit=0,
        pre_seq_len=None,
        prefix_projection=False,
        **kwargs
    ):
        self.num_layers = num_layers
        # vocab_size mirrors padded_vocab_size for HF compatibility.
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        self.hidden_dropout = hidden_dropout
        self.classifier_dropout = classifier_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        super().__init__(**kwargs)


================================================
FILE: src/resources/models/chatglm3-6B/modeling_chatglm.py
================================================
""" PyTorch ChatGLM model.
""" import math import copy import warnings import re import sys import torch import torch.utils.checkpoint import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss from torch.nn.utils import skip_init from typing import Optional, Tuple, Union, List, Callable, Dict, Any from copy import deepcopy from transformers.modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast, ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import logging from transformers.generation.logits_process import LogitsProcessor from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput from .configuration_chatglm import ChatGLMConfig # flags required to enable jit fusion kernels if sys.platform != 'darwin': torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) torch._C._jit_override_can_fuse_on_cpu(True) torch._C._jit_override_can_fuse_on_gpu(True) logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" _CONFIG_FOR_DOC = "ChatGLMConfig" CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ "THUDM/chatglm3-6b", # See all ChatGLM models at https://huggingface.co/models?filter=chatglm ] def default_init(cls, *args, **kwargs): return cls(*args, **kwargs) class InvalidScoreLogitsProcessor(LogitsProcessor): def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: if torch.isnan(scores).any() or torch.isinf(scores).any(): scores.zero_() scores[..., 5] = 5e4 return scores class PrefixEncoder(torch.nn.Module): """ The torch.nn model to encode the prefix Input shape: (batch-size, prefix-length) Output shape: (batch-size, prefix-length, 2*layers*hidden) """ def __init__(self, config: ChatGLMConfig): super().__init__() self.prefix_projection = config.prefix_projection if self.prefix_projection: # Use a two-layer MLP 
# to encode the prefix (continuation of the comment on the previous line)
            kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
            self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
            self.trans = torch.nn.Sequential(
                torch.nn.Linear(kv_size, config.hidden_size),
                torch.nn.Tanh(),
                torch.nn.Linear(config.hidden_size, kv_size)
            )
        else:
            # Direct embedding lookup, no projection MLP.
            self.embedding = torch.nn.Embedding(config.pre_seq_len,
                                                config.num_layers * config.kv_channels * config.multi_query_group_num * 2)

    def forward(self, prefix: torch.Tensor):
        if self.prefix_projection:
            prefix_tokens = self.embedding(prefix)
            past_key_values = self.trans(prefix_tokens)
        else:
            past_key_values = self.embedding(prefix)
        return past_key_values


def split_tensor_along_last_dim(
        tensor: torch.Tensor,
        num_partitions: int,
        contiguous_split_chunks: bool = False,
) -> List[torch.Tensor]:
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A list of Tensors (a tuple when contiguous_split_chunks is True).
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = tensor.size()[last_dim] // num_partitions
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list


class RotaryEmbedding(nn.Module):
    """Precomputes cos/sin rotary-position caches for attention."""

    def __init__(self, dim, original_impl=False, device=None, dtype=None):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.dim = dim
        self.original_impl = original_impl

    def forward_impl(
            self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
    ):
        """Enhanced Transformer with Rotary Position Embedding.
Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
        transformers/rope/__init__.py. MIT License:
        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
        """
        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
        theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))

        # Create position indexes `[0, 1, ..., seq_len - 1]`
        seq_idx = torch.arange(seq_len, dtype=torch.float, device=device)

        # Calculate the product of position index and $\theta_i$
        idx_theta = torch.outer(seq_idx, theta).float()

        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

        # this is to mimic the behaviour of complex32, else we will get different results
        if dtype in (torch.float16, torch.bfloat16, torch.int8):
            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
        return cache

    def forward(self, max_seq_len, offset=0):
        # `offset` is accepted for interface compatibility but not used by this implementation.
        return self.forward_impl(
            max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
        )


@torch.jit.script
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
    """Apply rotary position embeddings to the first rot_dim channels of x."""
    # x: [sq, b, np, hn]
    sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
    rot_dim = rope_cache.shape[-2] * 2
    # Only the leading rot_dim channels are rotated; the rest pass through unchanged.
    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
    # truncate to support variable sizes
    rope_cache = rope_cache[:sq]
    xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
    rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
    # Complex rotation expressed on (real, imag) channel pairs.
    x_out2 = torch.stack(
        [
            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
        ],
        -1,
    )
    x_out2 = x_out2.flatten(3)
    return torch.cat((x_out2, x_pass), dim=-1)


class RMSNorm(torch.nn.Module):
    """Root-mean-square LayerNorm (no mean subtraction, no bias)."""

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(normalized_shape,
                                                     device=device, dtype=dtype))
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor):
        # Compute the variance in fp32 for numerical stability, then cast back.
        input_dtype = hidden_states.dtype
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)

        return (self.weight * hidden_states).to(input_dtype)


class CoreAttention(torch.nn.Module):
    """Scaled dot-product attention core (no Q/K/V or output projections)."""

    def __init__(self, config: ChatGLMConfig, layer_number):
        super(CoreAttention, self).__init__()

        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        if self.apply_query_key_layer_scaling:
            # Layer scaling only makes sense with an fp32 softmax.
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)

        projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_partition = projection_size
        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            # Extra 1/layer_number scaling before softmax, undone after via self.coeff.
            coeff = self.layer_number
            self.norm_factor *= coeff
        self.coeff = coeff

        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        pytorch_major_version = int(torch.__version__.split('.')[0])
        if pytorch_major_version >= 2:
            # Fast path: fused SDPA kernel. [sq, b, np, hn] -> [b, np, sq, hn]
            query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in
                                                   [query_layer, key_layer, value_layer]]
            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
                # No explicit mask and square attention: let SDPA build the causal mask.
                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
                                                                                 is_causal=True)
            else:
                if attention_mask is not None:
                    # The model's mask uses True = masked; SDPA expects True = attend.
                    attention_mask = ~attention_mask
                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
                                                                                 attention_mask)
            # [b, np, sq, hn] -> [sq, b, np, hn] -> [sq, b, hp]
            context_layer = context_layer.permute(2, 0, 1, 3)
            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
            context_layer = context_layer.reshape(*new_context_layer_shape)
        else:
            # Raw attention scores

            # [b, np, sq, sk]
            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))

            # [sq, b, np, hn] -> [sq, b * np, hn]
            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
            # [sk, b, np, hn] -> [sk, b * np, hn]
            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)

            # preallocting input tensor: [b * np, sq, sk]
            matmul_input_buffer = torch.empty(
                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
                device=query_layer.device
            )

            # Raw attention scores. [b * np, sq, sk]
            matmul_result = torch.baddbmm(
                matmul_input_buffer,
                query_layer.transpose(0, 1),  # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
                beta=0.0,  # beta=0 means the preallocated buffer's contents are ignored
                alpha=(1.0 / self.norm_factor),
            )

            # change view to [b, np, sq, sk]
            attention_scores = matmul_result.view(*output_size)

            # ===========================
            # Attention probs and dropout
            # ===========================

            # attention scores and attention mask [b, np, sq, sk]
            if self.attention_softmax_in_fp32:
                attention_scores = attention_scores.float()
            if self.coeff is not None:
                attention_scores = attention_scores * self.coeff
            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
                # No mask and square scores: construct a causal mask (True = masked).
                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
                                            device=attention_scores.device, dtype=torch.bool)
                attention_mask.tril_()
                attention_mask = ~attention_mask
            if attention_mask is not None:
                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
            attention_probs = F.softmax(attention_scores, dim=-1)
            attention_probs = attention_probs.type_as(value_layer)

            # This is actually dropping out entire tokens to attend to, which might
            # seem a bit unusual, but is taken from the original Transformer paper.
            attention_probs = self.attention_dropout(attention_probs)
            # =========================
            # Context layer. [sq, b, hp]
            # =========================

            # value_layer -> context layer.
            # [sk, b, np, hn] --> [b, np, sq, hn]

            # context layer shape: [b, np, sq, hn]
            output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
            # change view [sk, b * np, hn]
            value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
            # change view [b * np, sq, sk]
            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
            # matmul: [b * np, sq, hn]
            context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
            # change view [b, np, sq, hn]
            context_layer = context_layer.view(*output_size)
            # [b, np, sq, hn] --> [sq, b, np, hn]
            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
            # [sq, b, np, hn] --> [sq, b, hp]
            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
            context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer


class SelfAttention(torch.nn.Module):
    """Parallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(SelfAttention, self).__init__()
        self.layer_number = max(1, layer_number)

        self.projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        self.multi_query_attention = config.multi_query_attention
        self.qkv_hidden_size = 3 * self.projection_size
        if self.multi_query_attention:
            # Multi-query attention: full-width Q, but only `multi_query_group_num`
            # key/value heads shared across the query heads.
            self.num_multi_query_groups_per_partition = config.multi_query_group_num
            self.qkv_hidden_size = (
                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
            )
        # Fused QKV projection.
        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
                                         bias=config.add_bias_linear or config.add_qkv_bias,
                                         device=device, **_config_to_kwargs(config)
                                         )

        self.core_attention = CoreAttention(config, self.layer_number)

        # Output.
        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
                               device=device, **_config_to_kwargs(config)
                               )

    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
        """Allocate an empty KV buffer of shape [max_seq, b, heads, hn]."""
        if self.multi_query_attention:
            num_attention_heads = self.num_multi_query_groups_per_partition
        else:
            num_attention_heads = self.num_attention_heads_per_partition
        return torch.empty(
            inference_max_sequence_len,
            batch_size,
            num_attention_heads,
            self.hidden_size_per_attention_head,
            dtype=dtype,
            device=device,
        )

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
    ):
        # hidden_states: [sq, b, h]

        # =================================================
        # Pre-allocate memory for key-values for inference.
        # =================================================
        # =====================
        # Query, Key, and Value
        # =====================

        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
        mixed_x_layer = self.query_key_value(hidden_states)

        if self.multi_query_attention:
            # Split the fused projection into one full-size Q plus grouped K and V.
            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
                [
                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                ],
                dim=-1,
            )
            query_layer = query_layer.view(
                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            key_layer = key_layer.view(
                key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition,
                                         self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.view(
                value_layer.size()[:-1] + (self.num_multi_query_groups_per_partition,
                                           self.hidden_size_per_attention_head)
            )
        else:
            new_tensor_shape = mixed_x_layer.size()[:-1] + \
                               (self.num_attention_heads_per_partition,
                                3 * self.hidden_size_per_attention_head)
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # apply relative positional encoding (rotary embedding)
        if rotary_pos_emb is not None:
            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

        # adjust key and value for inference
        if kv_cache is not None:
            # Prepend cached keys/values along the sequence (first) dimension.
            cache_k, cache_v = kv_cache
            key_layer = torch.cat((cache_k, key_layer), dim=0)
            value_layer = torch.cat((cache_v, value_layer), dim=0)
        if use_cache:
            kv_cache = (key_layer, value_layer)
        else:
            kv_cache = None

        if self.multi_query_attention:
            # Expand grouped K/V so that every query head has a matching K/V head.
            key_layer = key_layer.unsqueeze(-2)
            key_layer = key_layer.expand(
                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
            )
            key_layer = key_layer.contiguous().view(
                key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.unsqueeze(-2)
            value_layer = value_layer.expand(
                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
            )
            value_layer = value_layer.contiguous().view(
                value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )

        # ==================================
        # core attention computation
        # ==================================

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        # =================
        # Output. [sq, b, h]
        # =================

        output = self.dense(context_layer)

        return output, kv_cache


def _config_to_kwargs(args):
    # Common tensor-factory kwargs derived from the model config.
    common_kwargs = {
        "dtype": args.torch_dtype,
    }
    return common_kwargs


class MLP(torch.nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    """

    def __init__(self, config: ChatGLMConfig, device=None):
        super(MLP, self).__init__()

        self.add_bias = config.add_bias_linear

        # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
        self.dense_h_to_4h = nn.Linear(
            config.hidden_size,
            config.ffn_hidden_size * 2,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

        def swiglu(x):
            # SwiGLU: split in half, gate one half with SiLU of the other.
            x = torch.chunk(x, 2, dim=-1)
            return F.silu(x[0]) * x[1]

        self.activation_func = swiglu

        # Project back to h.
        self.dense_4h_to_h = nn.Linear(
            config.ffn_hidden_size,
            config.hidden_size,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

    def forward(self, hidden_states):
        # [s, b, 4hp]
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        intermediate_parallel = self.activation_func(intermediate_parallel)
        # [s, b, h]
        output = self.dense_4h_to_h(intermediate_parallel)
        return output


class GLMBlock(torch.nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(GLMBlock, self).__init__()
        self.layer_number = layer_number

        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm

        self.fp32_residual_connection = config.fp32_residual_connection

        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
        # Layernorm on the input data.
        self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                             dtype=config.torch_dtype)

        # Self attention.
        self.self_attention = SelfAttention(config, layer_number, device=device)
        self.hidden_dropout = config.hidden_dropout

        # Layernorm on the attention output
        self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                      dtype=config.torch_dtype)

        # MLP
        self.mlp = MLP(config, device=device)

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
    ):
        # hidden_states: [s, b, h]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        # Self attention.
        attention_output, kv_cache = self.self_attention(
            layernorm_output,
            attention_mask,
            rotary_pos_emb,
            kv_cache=kv_cache,
            use_cache=use_cache
        )

        # Residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
        layernorm_input = residual + layernorm_input

        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)

        # MLP.
        mlp_output = self.mlp(layernorm_output)

        # Second residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
        output = residual + output

        return output, kv_cache


class GLMTransformer(torch.nn.Module):
    """Transformer class: the stack of GLMBlock layers plus final layer norm."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(GLMTransformer, self).__init__()

        self.fp32_residual_connection = config.fp32_residual_connection
        self.post_layer_norm = config.post_layer_norm

        # Number of layers.
        self.num_layers = config.num_layers

        # Transformer layers.
        def build_layer(layer_number):
            return GLMBlock(config, layer_number, device=device)

        # Layer numbers are 1-based (used for query/key layer scaling).
        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])

        if self.post_layer_norm:
            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
            # Final layer norm before output.
            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                 dtype=config.torch_dtype)

        self.gradient_checkpointing = False

    def _get_layer(self, layer_number):
        return self.layers[layer_number]

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
            use_cache: Optional[bool] = True,
            output_hidden_states: Optional[bool] = False,
    ):
        if not kv_caches:
            kv_caches = [None for _ in range(self.num_layers)]
        presents = () if use_cache else None
        if self.gradient_checkpointing and self.training:
            if use_cache:
                # Checkpointing recomputes activations; a KV cache cannot be kept.
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        all_self_attentions = None
        all_hidden_states = () if output_hidden_states else None
        for index in range(self.num_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer = self._get_layer(index)
            if self.gradient_checkpointing and self.training:
                layer_ret = torch.utils.checkpoint.checkpoint(
                    layer,
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_caches[index],
                    use_cache
                )
            else:
                layer_ret = layer(
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_cache=kv_caches[index],
                    use_cache=use_cache
                )
            hidden_states, kv_cache = layer_ret
            if use_cache:
                presents = presents + (kv_cache,)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Final layer norm.
        if self.post_layer_norm:
            hidden_states = self.final_layernorm(hidden_states)

        return hidden_states, presents, all_hidden_states, all_self_attentions


class ChatGLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    is_parallelizable = False
    supports_gradient_checkpointing = True
    config_class = ChatGLMConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["GLMBlock"]

    def _init_weights(self, module: nn.Module):
        """Initialize the weights."""
        # Weights come from pretrained checkpoints; no random init is performed.
        return

    def get_masks(self, input_ids, past_key_values, padding_mask=None):
        """Build the full boolean attention mask (True = masked position)."""
        batch_size, seq_length = input_ids.shape
        # Causal (lower-triangular) base mask, 1.0 = attend.
        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
        full_attention_mask.tril_()
        past_length = 0
        if past_key_values:
            # Cached keys/values are stored sequence-first.
            past_length = past_key_values[0][0].shape[0]
        if past_length:
            # All cached positions are attendable from every new position.
            full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
                                                        device=input_ids.device), full_attention_mask), dim=-1)
        if padding_mask is not None:
            full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
        if not past_length and padding_mask is not None:
            # Allow padded query rows to attend to themselves so softmax stays finite.
            full_attention_mask -= padding_mask.unsqueeze(-1) - 1
        # Invert to True = masked, and add the head dimension.
        full_attention_mask = (full_attention_mask < 0.5).bool()
        full_attention_mask.unsqueeze_(1)
        return full_attention_mask

    def get_position_ids(self, input_ids, device):
        """Plain 0..seq_length-1 position ids, repeated per batch row."""
        batch_size, seq_length = input_ids.shape
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
        return position_ids

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, GLMTransformer):
            module.gradient_checkpointing = value


class Embedding(torch.nn.Module):
    """Language model embeddings."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(Embedding, self).__init__()

        self.hidden_size = config.hidden_size
        # Word embeddings (parallel).
        self.word_embeddings = nn.Embedding(
            config.padded_vocab_size,
            self.hidden_size,
            dtype=config.torch_dtype,
            device=device
        )
        self.fp32_residual_connection = config.fp32_residual_connection

    def forward(self, input_ids):
        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings
        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
        embeddings = embeddings.transpose(0, 1).contiguous()
        # If the input flag for fp32 residual connection is set, convert for float.
        if self.fp32_residual_connection:
            embeddings = embeddings.float()
        return embeddings


class ChatGLMModel(ChatGLMPreTrainedModel):
    """Bare ChatGLM transformer: embedding + GLMTransformer encoder + LM head weights."""

    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
        super().__init__(config)
        # skip_init avoids allocating/initializing weights that will be overwritten
        # when the checkpoint is loaded.
        if empty_init:
            init_method = skip_init
        else:
            init_method = default_init
        init_kwargs = {}
        if device is not None:
            init_kwargs["device"] = device
        self.embedding = init_method(Embedding, config, **init_kwargs)
        self.num_layers = config.num_layers
        self.multi_query_group_num = config.multi_query_group_num
        self.kv_channels = config.kv_channels

        # Rotary positional embeddings
        self.seq_length = config.seq_length
        rotary_dim = (
            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
        )

        # Only half of the head dimension is rotated (see apply_rotary_pos_emb).
        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
                                              dtype=config.torch_dtype)
        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
        self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
                                        dtype=config.torch_dtype, **init_kwargs)
        self.pre_seq_len = config.pre_seq_len
        self.prefix_projection = config.prefix_projection
        if self.pre_seq_len is not None:
            # P-tuning v2: freeze the backbone, train only the prefix encoder.
            for param in self.parameters():
                param.requires_grad = False
            self.prefix_tokens = torch.arange(self.pre_seq_len).long()
            self.prefix_encoder = PrefixEncoder(config)
            self.dropout = torch.nn.Dropout(0.1)

    def get_input_embeddings(self):
        return self.embedding.word_embeddings

    def get_prompt(self, batch_size, device, dtype=torch.half):
        """Build per-layer prefix key/value tensors from the trained prefix encoder."""
        prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
        past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
        past_key_values = past_key_values.view(
            batch_size,
            self.pre_seq_len,
            self.num_layers * 2,
            self.multi_query_group_num,
            self.kv_channels
        )
        # seq_len, b, nh, hidden_size
        past_key_values = self.dropout(past_key_values)
        # -> [num_layers * 2, pre_seq_len, b, nh, kv] then split into per-layer (k, v) pairs.
        past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
        return past_key_values

    def forward(
            self,
            input_ids,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.BoolTensor] = None,
            full_attention_mask: Optional[torch.BoolTensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ):
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length = input_ids.shape

        if inputs_embeds is None:
            inputs_embeds = self.embedding(input_ids)

        if self.pre_seq_len is not None:
            # Prefix tuning: inject the learned prefix as the initial KV cache
            # and extend the padding mask to cover the prefix positions.
            if past_key_values is None:
                past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
                                                  dtype=inputs_embeds.dtype)
            if attention_mask is not None:
                attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)),
                                            attention_mask], dim=-1)

        if full_attention_mask is None:
            # Only build the explicit mask when padding is present or a cache with
            # multi-token input forces it; otherwise the causal default suffices.
            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)

        # Rotary positional embeddings
        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
        if position_ids is not None:
            rotary_pos_emb = rotary_pos_emb[position_ids]
        else:
            rotary_pos_emb = rotary_pos_emb[None, :seq_length]
        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()

        # Run encoder.
        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
            inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
            kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
        )

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def quantize(self, weight_bit_width: int):
        """Quantize the encoder weights in place to the given bit width."""
        from .quantization import quantize
        quantize(self.encoder, weight_bit_width)
        return self


class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
    """ChatGLM with a causal-LM head plus chat/streaming generation helpers."""

    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
        super().__init__(config)

        self.max_sequence_length = config.max_length
        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
        self.config = config
        self.quantized = False

        if self.config.quantization_bit:
            self.quantize(self.config.quantization_bit, empty_init=True)

    def _update_model_kwargs_for_generation(
            self,
            outputs: ModelOutput,
            model_kwargs: Dict[str, Any],
            is_encoder_decoder: bool = False,
            standardize_cache_format: bool = False,
    ) -> Dict[str, Any]:
        """Advance generation state by one step (cache, mask, position ids)."""
        # update past_key_values
        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
            outputs, standardize_cache_format=standardize_cache_format
        )

        # update attention mask
        if "attention_mask" in model_kwargs:
            attention_mask = model_kwargs["attention_mask"]
            model_kwargs["attention_mask"] = torch.cat(
                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
            )

        # update position ids
        if "position_ids" in model_kwargs:
            position_ids = model_kwargs["position_ids"]
            new_position_id = position_ids[..., -1:].clone()
            new_position_id += 1
            model_kwargs["position_ids"] = torch.cat(
                [position_ids, new_position_id], dim=-1
            )

        model_kwargs["is_first_forward"] = False
        return model_kwargs

    def prepare_inputs_for_generation(
            self,
            input_ids: torch.LongTensor,
            past_key_values: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            is_first_forward: bool = True,
            **kwargs
    ) -> dict:
        # only last token for input_ids if past is not None
        if position_ids is None:
            position_ids = self.get_position_ids(input_ids, device=input_ids.device)
        if not is_first_forward:
            if past_key_values is not None:
                position_ids = position_ids[..., -1:]
                input_ids = input_ids[:, -1:]
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "return_last_logit": True,
            "use_cache": use_cache
        }

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        if return_last_logit:
            # Hidden states are sequence-first: keep only the final position.
            hidden_states = hidden_states[-1:]
        lm_logits = self.transformer.output_layer(hidden_states)
        # [sq, b, vocab] -> [b, sq, vocab]
        lm_logits = lm_logits.transpose(0, 1).contiguous()

        loss = None
        if labels is not None:
            # Compute the LM loss in fp32.
            lm_logits = lm_logits.to(torch.float32)

            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(ignore_index=self.config.pad_token_id)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            lm_logits = lm_logits.to(hidden_states.dtype)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
            past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        """
        # Cache layout is sequence-first, so the batch/beam axis is dim 1.
        return tuple(
            (
                layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
                layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
            )
            for layer_past in past
        )

    def process_response(self, output, history):
        """Parse raw decoder output into a response (text or tool call) and updated history."""
        content = ""
        history = deepcopy(history)
        for response in output.split("<|assistant|>"):
            # Each segment is "<metadata>\n<content>"; metadata names a tool when present.
            metadata, content = response.split("\n", maxsplit=1)
            if not metadata.strip():
                # Plain text reply.
                content = content.strip()
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                content = content.replace("[[训练时间]]", "2023年")
            else:
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                if history[0]["role"] == "system" and "tools" in history[0]:
                    # Tool call: strip the surrounding code-fence lines and evaluate the args.
                    content = "\n".join(content.split("\n")[1:-1])

                    def tool_call(**kwargs):
                        return kwargs

                    # NOTE(review): eval on model-generated text is unsafe if the
                    # output is not trusted; consider ast.literal_eval or a parser.
                    parameters = eval(content)
                    content = {"name": metadata.strip(), "parameters": parameters}
                else:
                    content = {"name": metadata.strip(), "content": content}
        return content, history

    @torch.inference_mode()
    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
             max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8,
             logits_processor=None, **kwargs):
        """Single-turn chat: build chat input, generate, and parse the response."""
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        inputs = tokenizer.build_chat_input(query, history=history, role=role)
        inputs = inputs.to(self.device)
        # Stop on EOS or on a turn switch back to the user/observation role.
        eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
                        tokenizer.get_command("<|observation|>")]
        outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
        # Drop the prompt tokens and the trailing stop token.
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
        response = tokenizer.decode(outputs)
        history.append({"role": role, "content": query})
        response, history = self.process_response(response, history)
        return response, history

    @torch.inference_mode()
    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
                    past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
                    logits_processor=None, return_past_key_values=False, **kwargs):
        """Streaming chat: yields (response, history[, past_key_values]) as tokens arrive."""
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
                        tokenizer.get_command("<|observation|>")]
        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        if past_key_values is None:
            inputs = tokenizer.build_chat_input(query, history=history, role=role)
        else:
            # With a cache, only the new turn needs to be encoded.
            inputs = tokenizer.build_chat_input(query, role=role)
        inputs = inputs.to(self.device)
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[0]
            if self.transformer.pre_seq_len is not None:
                # Exclude learned prefix positions from the visible past length.
                past_length -= self.transformer.pre_seq_len
            inputs.position_ids += past_length
            attention_mask = inputs.attention_mask
            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
            inputs['attention_mask'] = attention_mask
        history.append({"role": role, "content": query})
        for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
                                            eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
                                            **gen_kwargs):
            if return_past_key_values:
                outputs, past_key_values = outputs
            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
            response = tokenizer.decode(outputs)
            # Skip yields that end in an incomplete multi-byte character.
            if response and response[-1] != "�":
                response, new_history = self.process_response(response, history)
                if return_past_key_values:
                    yield response, new_history, past_key_values
                else:
                    yield response, new_history

    @torch.inference_mode()
    def stream_generate(
            self,
            input_ids,
generation_config: Optional[GenerationConfig] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, return_past_key_values=False, **kwargs, ): batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] if generation_config is None: generation_config = self.generation_config generation_config = copy.deepcopy(generation_config) model_kwargs = generation_config.update(**kwargs) model_kwargs["use_cache"] = generation_config.use_cache bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None if has_default_max_length and generation_config.max_new_tokens is None: warnings.warn( f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" " recommend using `max_new_tokens` to control the maximum length of the generation.", UserWarning, ) elif generation_config.max_new_tokens is not None: generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if not has_default_max_length: logger.warn( f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " "Please refer to the documentation for more information. 
" "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", UserWarning, ) if input_ids_seq_length >= generation_config.max_length: input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" logger.warning( f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" " increasing `max_new_tokens`." ) # 2. Set generation parameters if not already defined logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() logits_processor = self._get_logits_processor( generation_config=generation_config, input_ids_seq_length=input_ids_seq_length, encoder_input_ids=input_ids, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, logits_processor=logits_processor, ) stopping_criteria = self._get_stopping_criteria( generation_config=generation_config, stopping_criteria=stopping_criteria ) logits_warper = self._get_logits_warper(generation_config) unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) scores = None while True: model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) # forward pass to get next token outputs = self( **model_inputs, return_dict=True, output_attentions=False, output_hidden_states=False, ) next_token_logits = outputs.logits[:, -1, :] # pre-process distribution next_token_scores = logits_processor(input_ids, next_token_logits) next_token_scores = logits_warper(input_ids, next_token_scores) # sample probs = nn.functional.softmax(next_token_scores, dim=-1) if generation_config.do_sample: next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) else: next_tokens = torch.argmax(probs, dim=-1) # update generated ids, model inputs, and length for next step input_ids = torch.cat([input_ids, next_tokens[:, None]], 
dim=-1) model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) unfinished_sequences = unfinished_sequences.mul( next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) ) if return_past_key_values: yield input_ids, outputs.past_key_values else: yield input_ids # stop when each sentence is finished, or if we exceed the maximum length if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): break def quantize(self, bits: int, empty_init=False, device=None, **kwargs): if bits == 0: return from .quantization import quantize if self.quantized: logger.info("Already quantized.") return self self.quantized = True self.config.quantization_bit = bits self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, **kwargs) return self class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): super().__init__(config) self.num_labels = config.num_labels self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) if config.classifier_dropout is not None: self.dropout = nn.Dropout(config.classifier_dropout) else: self.dropout = None self.config = config if self.config.quantization_bit: self.quantize(self.config.quantization_bit, empty_init=True) def forward( self, input_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, full_attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, inputs_embeds: Optional[torch.LongTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_hidden_states: Optional[bool] = None, 
return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask, full_attention_mask=full_attention_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] pooled_hidden_states = hidden_states[-1] if self.dropout is not None: pooled_hidden_states = self.dropout(pooled_hidden_states) logits = self.classifier_head(pooled_hidden_states) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze().float(), labels.squeeze()) else: loss = loss_fct(logits.float(), labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) if not return_dict: output = (logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, logits=logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) ================================================ FILE: 
src/resources/models/chatglm3-6B/quantization.py ================================================ from torch.nn import Linear from torch.nn.parameter import Parameter import bz2 import torch import base64 import ctypes from transformers.utils import logging from typing import List from functools import partial logger = logging.get_logger(__name__) try: from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up class Kernel: def __init__(self, code: bytes, function_names: List[str]): self.code = code self._function_names = function_names self._cmodule = LazyKernelCModule(self.code) for name in self._function_names: setattr(self, name, KernelFunction(self._cmodule, name)) quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU
7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExj
evP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06
m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incp
kyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" kernels = Kernel( bz2.decompress(base64.b64decode(quantization_code)), [ "int4WeightCompression", "int4WeightExtractionFloat", "int4WeightExtractionHalf", "int8WeightExtractionFloat", "int8WeightExtractionHalf", ], ) except Exception as exception: kernels = None logger.warning("Failed to load cpm_kernels:" + str(exception)) class W8A16Linear(torch.autograd.Function): @staticmethod def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): ctx.inp_shape = inp.size() ctx.weight_bit_width = weight_bit_width out_features = quant_w.size(0) inp = inp.contiguous().view(-1, inp.size(-1)) weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) ctx.weight_shape = weight.size() output = inp.mm(weight.t()) ctx.save_for_backward(inp, quant_w, scale_w) return output.view(*(ctx.inp_shape[:-1] + (out_features,))) @staticmethod def backward(ctx, grad_output: torch.Tensor): inp, quant_w, scale_w = ctx.saved_tensors weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) grad_output = grad_output.contiguous().view(-1, weight.size(0)) grad_input = grad_output.mm(weight) grad_weight = grad_output.t().mm(inp) return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None def compress_int4_weight(weight: torch.Tensor): # (n, m) with torch.cuda.device(weight.device): n, m = weight.size(0), weight.size(1) assert m % 2 == 0 m = m // 2 out = torch.empty(n, m, dtype=torch.int8, device="cuda") stream = torch.cuda.current_stream() gridDim = (n, 1, 1) blockDim = (min(round_up(m, 32), 1024), 1, 1) kernels.int4WeightCompression( gridDim, blockDim, 0, stream, [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], ) return out def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): assert scale_list.dtype in [torch.half, torch.bfloat16] 
    assert weight.dtype in [torch.int8]
    if source_bit_width == 8:
        # int8: plain dequantization — one scale per output row.
        return weight.to(scale_list.dtype) * scale_list[:, None]
    elif source_bit_width == 4:
        # NOTE(review): `int4WeightExtractionBFloat16` is not in the list of kernel
        # function names registered for `kernels` above, so the bfloat16 branch
        # would raise AttributeError — confirm against the cpm_kernels build.
        func = (
            kernels.int4WeightExtractionHalf
            if scale_list.dtype == torch.half
            else kernels.int4WeightExtractionBFloat16
        )
    else:
        assert False, "Unsupported bit-width"

    with torch.cuda.device(weight.device):
        n, m = weight.size(0), weight.size(1)
        # Each packed int8 byte holds 8 // source_bit_width quantized values.
        out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda")
        stream = torch.cuda.current_stream()

        gridDim = (n, 1, 1)
        blockDim = (min(round_up(m, 32), 1024), 1, 1)

        func(
            gridDim,
            blockDim,
            0,
            stream,
            [
                ctypes.c_void_p(weight.data_ptr()),
                ctypes.c_void_p(scale_list.data_ptr()),
                ctypes.c_void_p(out.data_ptr()),
                ctypes.c_int32(n),
                ctypes.c_int32(m),
            ],
        )
        return out


class QuantizedLinear(torch.nn.Module):
    """Linear layer that stores int8/int4-quantized weights plus per-row scales."""

    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
                 **kwargs):
        super().__init__()
        self.weight_bit_width = weight_bit_width

        # NOTE(review): `weight.shape` is read before the `weight is None` check below,
        # so passing weight=None would raise here first — confirm intended.
        shape = weight.shape

        if weight is None or empty_init:
            # Allocate uninitialized buffers; weights are expected to be loaded later.
            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
        else:
            # Symmetric per-row quantization: scale maps the int range back to fp.
            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
            if weight_bit_width == 4:
                # Pack two 4-bit values per byte.
                self.weight = compress_int4_weight(self.weight)

        self.weight = Parameter(self.weight.to(device), requires_grad=False)
        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None

    def forward(self, input):
        # Dequantize on the fly inside the custom autograd function.
        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
        if self.bias is not None:
            output = output + self.bias
        return output


def quantize(model, weight_bit_width, empty_init=False, device=None):
    """Replace fp16 linear with quantized linear.

    Swaps the four projection layers of every transformer layer (QKV, attention
    output, and both MLP projections) for QuantizedLinear modules; returns `model`.
    """
    for layer in model.layers:
        layer.self_attention.query_key_value = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
            bias=layer.self_attention.query_key_value.bias,
            dtype=layer.self_attention.query_key_value.weight.dtype,
            device=layer.self_attention.query_key_value.weight.device if device is None else device,
            empty_init=empty_init
        )
        layer.self_attention.dense = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
            bias=layer.self_attention.dense.bias,
            dtype=layer.self_attention.dense.weight.dtype,
            device=layer.self_attention.dense.weight.device if device is None else device,
            empty_init=empty_init
        )
        layer.mlp.dense_h_to_4h = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
            bias=layer.mlp.dense_h_to_4h.bias,
            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
            empty_init=empty_init
        )
        layer.mlp.dense_4h_to_h = QuantizedLinear(
            weight_bit_width=weight_bit_width,
            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
            bias=layer.mlp.dense_4h_to_h.bias,
            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
            empty_init=empty_init
        )

    return model


================================================
FILE: src/resources/models/chatglm3-6B/tokenization_chatglm.py
================================================
import json
import os
import torch
from typing import List, Optional, Union, Dict
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding


class SPTokenizer:
    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        # self.pad_id: int = self.sp_model.unk_id()
        self.pad_id: int = -100
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>",
                          "<|observation|>"]
        self.special_tokens = {}
        self.index_special_tokens = {}
        # Chat/control tokens get fresh ids appended after the sentencepiece vocab.
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        """Split text into sentencepiece pieces (strings, not ids)."""
        return self.sp_model.EncodeAsPieces(s)

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        """Encode text to ids, optionally wrapping with BOS/EOS."""
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode ids, splicing in the text of the extra special tokens.

        sentencepiece does not know the appended special-token ids, so ordinary
        ids are buffered and flushed whenever a special token is encountered.
        """
        text, buffer = "", []
        for token in t:
            if token in self.index_special_tokens:
                if buffer:
                    text += self.sp_model.decode(buffer)
                    buffer = []
                text += self.index_special_tokens[token]
            else:
                buffer.append(token)
        if buffer:
            text += self.sp_model.decode(buffer)
        return text

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self.sp_model.DecodePieces(tokens)
        return text

    def convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.index_special_tokens:
            return self.index_special_tokens[index]
        # Control ids and invalid indices decode to the empty string.
        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
            return ""
        return self.sp_model.IdToPiece(index)


class ChatGLMTokenizer(PreTrainedTokenizer):
    """HuggingFace tokenizer wrapper around SPTokenizer for ChatGLM3."""

    vocab_files_names = {"vocab_file": "tokenizer.model"}

    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
        self.name = "GLMTokenizer"

        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        # NOTE(review): these keys appear to be special-token names that were lost
        # in extraction (likely "<bos>"/"<eos>"/"<pad>") — as written all three map
        # to the same "" key; confirm against the upstream file.
        self.special_tokens = {
            "": self.tokenizer.bos_id,
            "": self.tokenizer.eos_id,
            "": self.tokenizer.pad_id
        }
        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                         **kwargs)

    def get_command(self, token):
        """Return the id of a control/special token; asserts if unknown."""
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return ""

    @property
    def pad_token(self) -> str:
        return ""

    @property
    def pad_token_id(self):
        return self.get_command("")

    @property
    def eos_token(self) -> str:
        return ""

    @property
    def eos_token_id(self):
        return self.get_command("")

    @property
    def vocab_size(self):
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the names of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        # Copy the raw sentencepiece model bytes to the target location.
        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        # Every ChatGLM3 input is prefixed with [gMASK] sop.
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message):
        """Encode one chat turn as <|role|> metadata\\n message."""
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message)
        tokens = role_tokens + message_tokens
        return tokens

    def build_chat_input(self, query, history=None, role="user"):
        """Encode history + the new query, primed with <|assistant|> for the reply."""
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            # Tool definitions ride along with the system message as pretty-printed JSON.
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, "", query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")]
        return token_ids_0

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        # This tokenizer only implements left padding.
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round max_length up to the requested multiple.
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            # Left-pad mask with 0, position ids with 0, and tokens with pad id.
            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs


================================================
FILE: src/resources/models/chatglm3-6B/tokenizer_config.json
================================================
{
  "name_or_path": "THUDM/chatglm3-6b",
  "remove_space": false,
  "do_lower_case": false,
  "tokenizer_class": "ChatGLMTokenizer",
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_chatglm.ChatGLMTokenizer",
      null
    ]
  }
}


================================================
FILE: src/resources/models/glm-10B-chinese/README.md
================================================
---
language:
- zh
tags:
- glm
- chatgpt
---

Link
to github: [here](https://github.com/sunzeyeah/RLHF) --- 本仓库由[THUDM/glm-10b-chinese](https://huggingface.co/THUDM/glm-10b-chinese) fork而来,原仓库实现了PyTorch版本的GLM模型,该模型有100亿参数量,模型权重文件以FP32格式存储。 本仓库在原始代码的基础上进行了部分调整,以支持ChatGPT训练pipeline,具体实现可参考:[sunzeyeah/RLHF](https://github.com/sunzeyeah/RLHF). This repository is forked from [THUDM/glm-10b-chinese](https://huggingface.co/THUDM/glm-10b-chinese) that contains PyTorch implementation of GLM model with 10 billion parameters pretrained weights (FP32 precision). It is slightly different from the original GLM implementation to support the ChatGPT training pipeline in this github repo: [sunzeyeah/RLHF](https://github.com/sunzeyeah/RLHF). --- # Model description GLM is a General Language Model pretrained with an autoregressive blank-filling objective and can be finetuned on various natural language understanding and generation tasks. Please refer to our paper for a detailed description of GLM: [GLM: General Language Model Pretraining with Autoregressive Blank Infilling](https://arxiv.org/abs/2103.10360) (ACL 2022) Zhengxiao Du*, Yujie Qian*, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, Jie Tang (*: equal contribution) Find more examples in our [Github repo](https://github.com/THUDM/GLM). `glm-10b-chinese` is pretrained on the [WuDaoCorpora](https://www.sciencedirect.com/science/article/pii/S2666651021000152) dataset. It has 48 transformer layers, with hidden size 4096 and 64 attention heads in each layer. The model is pretrained with autoregressive blank filling objectives designed for natural language understanding, seq2seq, and language modeling. 
--- # Usage (Text Generation) ```python from transformers import AutoTokenizer, AutoModelForSeq2SeqLM tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/glm-10B-chinese", trust_remote_code=True) model = AutoModelForSeq2SeqLM.from_pretrained("sunzeyeah/glm-10B-chinese", trust_remote_code=True) model = model.half().cuda() max_length = 512 prompt = "我不能确定对方是不是喜欢我,我却想分分秒秒跟他在一起,有谁能告诉我如何能想他少一点" prefix = "回答:" encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token) prompt_length = len(encoded_prompt['input_ids']) encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token, max_length=min(prompt_length, max_length), truncation="only_first", return_tensors="pt", return_token_type_ids=False) max_gen_length = max_length - encoded_dict['input_ids'].shape[1] inputs = tokenizer.build_inputs_for_generation(encoded_dict, max_gen_length=max_gen_length, padding=True) inputs = inputs.cuda() outputs = model.generate(**inputs, max_new_tokens=max_gen_length, eos_token_id=tokenizer.eop_token_id, pad_token_id=tokenizer.pad_token_id, do_sample=False, num_return_sequences=1, top_p=0.8, temperature=1.0) results = tokenizer.batch_decode(outputs, skip_special_tokens=True) print(results) ``` ================================================ FILE: src/resources/models/glm-10B-chinese/added_tokens.json ================================================ { "<|endoftext|>": 50000, "[SEP]": 50001, "[CLS]": 50002, "[MASK]": 50003, "[UNUSED1]": 50004, "[UNUSED2]": 50005, "<|startofpiece|>": 50006, "<|endofpiece|>": 50007, "[sMASK]": 50008, "[gMASK]": 50009 } ================================================ FILE: src/resources/models/glm-10B-chinese/config.json ================================================ { "model_name_or_path": "sunzeyeah/glm-10B-chinese", "architectures": [ "GLMModel" ], "attention_dropout_prob": 0.1, "attention_scale": 1.0, "auto_map": { "AutoConfig": "configuration_glm.GLMConfig", "AutoModel": "modeling_glm.GLMModel", "AutoModelForSequenceClassification": 
"modeling_glm.GLMForSequenceClassification", "AutoModelForSeq2SeqLM": "modeling_glm.GLMForConditionalGeneration" }, "block_position_encoding": true, "checkpoint_activations": false, "checkpoint_num_layers": 1, "embedding_dropout_prob": 0.1, "hidden_size": 4096, "initializer_range": 0.02, "max_sequence_length": 1024, "model_type": "glm", "num_attention_heads": 64, "num_layers": 48, "output_dropout_prob": 0.1, "output_predict": true, "parallel_output": true, "pool_token": "cls", "relative_encoding": false, "spell_func": "lstm", "spell_length": null, "torch_dtype": "float32", "vocab_size": 50048, "bad_words_id": [50009], "pad_token_id": 50000 } ================================================ FILE: src/resources/models/glm-10B-chinese/configuration_glm.py ================================================ # coding=utf-8 # Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ GLM model configuration """ from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) GLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { "shunxing1234/GLM": "https://huggingface.co/shunxing1234/GLM/resolve/main/config.json", # See all GLM models at https://huggingface.co/models?filter=glm } class GLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`~GLMModel`]. 
It is used to instantiate an GLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the GLM [shunxing1234/GLM-base-cased](https://huggingface.co/shunxing1234/GLM-base-cased) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the GLM model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`~GLMModel`] or [`~TFGLMModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimension of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (`int`, *optional*, defaults to 3072): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. hidden_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention probabilities. max_position_embeddings (`int`, *optional*, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 
        type_vocab_size (`int`, *optional*, defaults to 2):
            The vocabulary size of the `token_type_ids` passed when calling [`~GLMModel`] or [`~TFGLMModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.

    Example:

    ```python
    >>> from transformers import GLMModel, GLMConfig

    >>> # Initializing a GLM shunxing1234/GLM-base-cased style configuration
    >>> configuration = GLMConfig()

    >>> # Initializing a model from the shunxing1234/GLM-base-cased style configuration
    >>> model = GLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "glm"
    # Expose the HF-conventional name `num_hidden_layers` as an alias of `num_layers`.
    attribute_map = {
        "num_hidden_layers": "num_layers"
    }

    def __init__(
            self,
            num_layers=24,
            vocab_size=30592,
            hidden_size=1024,
            num_attention_heads=16,
            embedding_dropout_prob=0.1,
            attention_dropout_prob=0.1,
            output_dropout_prob=0.1,
            max_sequence_length=512,
            checkpoint_activations=False,
            checkpoint_num_layers=1,
            parallel_output=True,
            relative_encoding=False,
            block_position_encoding=True,
            output_predict=False,
            spell_length=None,
            spell_func="lstm",
            attention_scale=1.0,
            initializer_range=0.02,
            pool_token="cls",
            **kwargs
    ):
        # Model geometry.
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        # Dropout probabilities for the embedding, attention and output sub-layers.
        self.embedding_dropout_prob = embedding_dropout_prob
        self.attention_dropout_prob = attention_dropout_prob
        self.output_dropout_prob = output_dropout_prob
        self.max_sequence_length = max_sequence_length
        # Activation-checkpointing options (chunked by `checkpoint_num_layers`).
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers
        self.parallel_output = parallel_output
        # Positional-encoding flavour: relative encoding vs GLM's 2D block position encoding.
        self.relative_encoding = relative_encoding
        self.block_position_encoding = block_position_encoding
        # When True, the model projects hidden states back onto the vocabulary (LM logits).
        self.output_predict = output_predict
        # presumably prompt-tuning ("spell") options — TODO confirm against the training code
        self.spell_length = spell_length
        self.spell_func = spell_func
        self.attention_scale = attention_scale
        self.initializer_range = initializer_range
        self.pool_token = pool_token
        super().__init__(**kwargs)


================================================
FILE: src/resources/models/glm-10B-chinese/modeling_glm.py
================================================
# coding=utf-8
# Copyright 2022 shunxing1234 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch GLM model.
""" import math import torch import torch.utils.checkpoint import torch.nn.functional as F from torch.nn import init, LayerNorm, Linear, CrossEntropyLoss from transformers.activations import gelu from transformers.utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, ) from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, ModelOutput, SequenceClassifierOutput, ) from transformers.modeling_utils import ( PreTrainedModel, ) from .configuration_glm import GLMConfig from torch.nn.parameter import Parameter _CHECKPOINT_FOR_DOC = "shunxing1234/GLM" _CONFIG_FOR_DOC = "GLMConfig" _TOKENIZER_FOR_DOC = "GLMTokenizer" GLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ "shunxing1234/GLM", # See all GLM models at https://huggingface.co/models?filter=glm ] def unscaled_init_method(sigma): """Init method based on N(0, sigma).""" def init_(tensor): return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) return init_ def scaled_init_method(mean, std, num_layers): """Init method based on N(0, sigma/sqrt(2*num_layers).""" std = std / math.sqrt(2.0 * num_layers) def init_(tensor): return torch.nn.init.normal_(tensor, mean=mean, std=std) return init_ def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, '{} is not divisible by {}'.format( numerator, denominator) def divide(numerator, denominator): """Ensure that numerator is divisible by the denominator and return the division value.""" ensure_divisibility(numerator, denominator) return numerator // denominator def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False): """Split a tensor along its last dimension. Arguments: tensor: input tensor. num_partitions: number of partitions to split the tensor contiguous_split_chunks: If True, make each chunk contiguous in memory. """ # Get the size and dimension. 
last_dim = tensor.dim() - 1 last_dim_size = divide(tensor.size()[last_dim], num_partitions) # Split. tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) # Note: torch.split does not create contiguous tensors by default. if contiguous_split_chunks: return tuple(chunk.contiguous() for chunk in tensor_list) return tensor_list class MLP(torch.nn.Module): """MLP for GPT2. MLP will take the input with h hidden state, project it to 4*h hidden dimension, perform gelu transformation, and project the state back into h hidden dimension. At the end, dropout is also applied. Arguments: hidden_size: The hidden size of the self attention. output_dropout_prob: dropout probability for the outputs after self attention and final output. init_method: initialization method used for the weights. Note that all biases are initialized to zero and layernorm weight are initialized to one. output_layer_init_method: output layer initialization. If None, use `init_method`. """ def __init__(self, hidden_size, output_dropout_prob, init_method, output_layer_init_method=None): super(MLP, self).__init__() # Set output layer initialization if not provided. if output_layer_init_method is None: output_layer_init_method = init_method # Project to 4h. self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size) # Project back to h. self.dense_4h_to_h = Linear( 4 * hidden_size, hidden_size) self.dropout = torch.nn.Dropout(output_dropout_prob) def forward(self, hidden_states): # [b, s, 4hp] intermediate_parallel = self.dense_h_to_4h(hidden_states) intermediate_parallel = gelu(intermediate_parallel) # [b, s, h] output = self.dense_4h_to_h(intermediate_parallel) output = self.dropout(output) return output class VocabEmbedding(torch.nn.Module): """Embedding parallelized in the vocabulary dimension. This is mainly adapted from torch.nn.Embedding and all the default values are kept. Arguments: num_embeddings: vocabulary size. embedding_dim: size of hidden state. 
init_method: method to initialize weights. """ def __init__(self, config): super(VocabEmbedding, self).__init__() # Keep the input dimensions. self.num_embeddings = config.vocab_size self.embedding_dim = config.hidden_size # Set the detauls for compatibility. self.padding_idx = None self.max_norm = None self.norm_type = 2. self.scale_grad_by_freq = False self.sparse = False self._weight = None self.vocab_start_index = 0 self.vocab_end_index = self.num_embeddings # Allocate weights. weight = torch.Tensor(self.num_embeddings, self.embedding_dim) self.register_buffer("weight", weight) # And initialize. init.xavier_normal_(self.weight) def forward(self, input_): # Get the embeddings. output = F.embedding(input_, self.weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse) return output class PositionalEmbedding(torch.nn.Module): def __init__(self, hidden_size): super(PositionalEmbedding, self).__init__() self.hidden_size = hidden_size inv_freq = 1 / (10000 ** (torch.arange(0.0, hidden_size, 2.0) / hidden_size)) self.register_buffer('inv_freq', inv_freq) def forward(self, pos_seq, bsz=None): sinusoid_inp = torch.ger(pos_seq, self.inv_freq) pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) if bsz is not None: return pos_emb[None, :, :].expand(bsz, -1, -1) else: return pos_emb[None, :, :] class SelfAttention(torch.nn.Module): """self-attention layer for GLM. Self-attention layer takes input with size [b, s, h] where b is the batch size, s is the sequence lenght, and h is the hidden size and creates output of the same size. Arguments: hidden_size: total hidden size of the layer (h). num_attention_heads: number of attention heads (n). Note that we require n to be divisible by number of GPUs used to parallelize the model. Also, we require hidden size to be divisible by n. attention_dropout_prob: dropout probability for the attention scores. init_method: weight initialization. 
output_layer_init_method: output layer initialization. If None, use `init_method`. We use the following notation: h: hidden_size n: num_attention_heads p: number of partitions np: n/p hp: h/p hn: h/n b: batch size s: sequence length """ def __init__(self, hidden_size, num_attention_heads, attention_dropout_prob, output_dropout_prob, init_method, output_layer_init_method=None, attention_scale=1.0): super(SelfAttention, self).__init__() # Set output layer initialization if not provided. if output_layer_init_method is None: output_layer_init_method = init_method # Per attention head and per partition values. self.hidden_size = hidden_size self.hidden_size_per_attention_head = divide(hidden_size, num_attention_heads) self.num_attention_heads = num_attention_heads self.attention_scale = attention_scale # Strided linear layer. self.query_key_value = Linear(hidden_size, 3 * hidden_size) # Dropout. Note that for a single iteration, this layer will generate # different outputs on different number of parallel partitions but # on average it should not be partition dependent. self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) # Output. self.dense = Linear(hidden_size, hidden_size) self.output_dropout = torch.nn.Dropout(output_dropout_prob) def _transpose_for_scores(self, tensor): """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with size [b, np, s, hn]. """ new_tensor_shape = tensor.size()[:-1] + \ (self.num_attention_heads, self.hidden_size_per_attention_head) tensor = tensor.view(*new_tensor_shape) return tensor.permute(0, 2, 1, 3) def forward(self, hidden_states, ltor_mask, mem=None): # hidden_states: [b, s, h] # ltor_mask: [b,1,s,s] # Attention heads. 
[b, s, hp] query_length = hidden_states.size(1) # self attention if mem is None: mixed_x_layer = self.query_key_value(hidden_states) (mixed_query_layer, mixed_key_layer, mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) else: cat = torch.cat((mem, hidden_states), 1) mixed_x_layer = self.query_key_value(cat) (mixed_query_layer, mixed_key_layer, mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) mixed_query_layer = mixed_query_layer[:, -query_length:] # Reshape and transpose [b, np, s, hn] query_layer = self._transpose_for_scores(mixed_query_layer) key_layer = self._transpose_for_scores(mixed_key_layer) value_layer = self._transpose_for_scores(mixed_value_layer) if self.attention_scale > 1.0: # Raw attention scores. [b, np, s, s] attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_scale), key_layer.transpose(-1, -2) / math.sqrt( self.hidden_size_per_attention_head * self.attention_scale)) else: attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2) / math.sqrt( self.hidden_size_per_attention_head)) # Apply the left to right attention mask. ltor_mask = ltor_mask.type_as(attention_scores) attention_scores = torch.mul(attention_scores, ltor_mask) if self.attention_scale > 1.0: max_attention_scores = attention_scores.max(dim=-1, keepdim=True)[0] attention_scores -= max_attention_scores attention_scores *= self.attention_scale attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask) # Attention probabilities. [b, np, s, s] attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. # with get_cuda_rng_tracker().fork(): attention_probs = self.attention_dropout(attention_probs) # Context layer. 
# [b, np, s, hn] context_layer = torch.matmul(attention_probs, value_layer) # [b, s, np, hn] context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + \ (self.hidden_size,) # [b, s, hp] context_layer = context_layer.view(*new_context_layer_shape) # Output. [b, s, h] output = self.dense(context_layer) output = self.output_dropout(output) return output class GLMBlock(torch.nn.Module): """A single layer transformer for GLM. We use the following notation: h: hidden size n: number of attention heads b: batch size s: sequence length Transformore layer takes input with size [b, s, h] and returns an output of the same size. Arguments: hidden_size: The hidden size of the self attention. num_attention_heads: number of attention head in the self attention. attention_dropout_prob: dropout probability of the attention score in self attention. output_dropout_prob: dropout probability for the outputs after self attention and final output. layernorm_epsilon: epsilon used in layernorm to avoid division by zero. init_method: initialization method used for the weights. Note that all biases are initialized to zero and layernorm weight are initialized to one. output_layer_init_method: output layers (attention output and mlp output) initialization. If None, use `init_method`. """ def __init__(self, hidden_size, num_attention_heads, attention_dropout_prob, output_dropout_prob, layernorm_epsilon, init_method, output_layer_init_method=None, attention_scale=1.0): super(GLMBlock, self).__init__() # Set output layer initialization if not provided. if output_layer_init_method is None: output_layer_init_method = init_method # Layernorm on the input data. self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) # Self attention. 
self.attention = SelfAttention( hidden_size, num_attention_heads, attention_dropout_prob, output_dropout_prob, init_method, output_layer_init_method=output_layer_init_method, attention_scale=attention_scale) # Layernorm on the input data. self.post_attention_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) # MLP self.mlp = MLP( hidden_size, output_dropout_prob, init_method, output_layer_init_method=output_layer_init_method) def forward(self, hidden_states, ltor_mask, mem=None): # hidden_states: [b, s, h] # ltor_mask: [b,1, s,s] # Layer norm at the begining of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) mem = self.input_layernorm(mem) if mem is not None else None # Self attention. attention_output = self.attention(layernorm_output, ltor_mask, mem) # Residual connection. layernorm_input = hidden_states + attention_output # Layer norm post the self attention. layernorm_output = self.post_attention_layernorm(layernorm_input) # MLP. mlp_output = self.mlp(layernorm_output) # Second residual connection. output = layernorm_input + mlp_output return output class GLMStack(torch.nn.Module): """GLM transformer. This module takes input from embedding layer and it's output can be used directly by a logit layer. It consists of L (num-layers) blocks of: layer norm self attention residual connection layer norm mlp residual connection followed by a final layer norm. Arguments: num_layers: Number of transformer layers. hidden_size: The hidden size of the self attention. num_attention_heads: number of attention head in the self attention. attention_dropout_prob: dropout probability of the attention score in self attention. output_dropout_prob: dropout probability for the outputs after self attention and final output. checkpoint_activations: if True, checkpoint activations. checkpoint_num_layers: number of layers to checkpoint. This is basically the chunk size in checkpoitning. 
layernorm_epsilon: epsilon used in layernorm to avoid division by zero. init_method_std: standard deviation of the init method which has the form N(0, std). use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) scaling for the output weights ( output of self attention and mlp). """ def __init__(self, num_layers, hidden_size, num_attention_heads, max_sequence_length, embedding_dropout_prob, attention_dropout_prob, output_dropout_prob, checkpoint_activations, checkpoint_num_layers=1, layernorm_epsilon=1.0e-5, init_method_std=0.02, use_scaled_init_for_output_weights=True, block_position_encoding=False, attention_scale=1.0, ): super(GLMStack, self).__init__() self.hidden_size = hidden_size # Store activation checkpoiting flag. self.checkpoint_activations = checkpoint_activations self.checkpoint_num_layers = checkpoint_num_layers output_layer_init_method = None if use_scaled_init_for_output_weights: output_layer_init_method = scaled_init_method(0.0, init_method_std, num_layers) # Embeddings dropout self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) self.block_position_encoding = block_position_encoding # Position embedding (serial). if block_position_encoding: self.position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size) self.block_position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size) torch.nn.init.normal_(self.block_position_embeddings.weight, mean=0.0, std=init_method_std) else: self.position_embeddings = torch.nn.Embedding(max_sequence_length, hidden_size) # Initialize the position embeddings. torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std) def get_layer(): return GLMBlock( hidden_size, num_attention_heads, attention_dropout_prob, output_dropout_prob, layernorm_epsilon, unscaled_init_method(init_method_std), output_layer_init_method=output_layer_init_method, attention_scale=attention_scale) # Transformer layers. 
self.layers = torch.nn.ModuleList( [get_layer() for _ in range(num_layers)]) # Final layer norm before output. self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) def forward(self, hidden_states, position_ids, attention_mask, memory_states=None): batch_size, query_length = hidden_states.size()[:2] memory_length = memory_states[0].size(1) if memory_states else 0 # attention mask is the beginning postion of B region, \in [0, query_len) is_scalar = torch.numel(attention_mask) == 1 is_sep = is_scalar or torch.numel(attention_mask) == batch_size if is_sep: sep = attention_mask.item() if is_scalar else attention_mask # conventional transformer def build_mask_matrix(seq_length, sep, memory_length=0): m = hidden_states.new_ones((1, seq_length, seq_length)) m = torch.tril(m) if is_scalar: m[0, :, :int(sep)] = 1 else: m = m.expand(batch_size, -1, -1) ids = torch.arange(seq_length, device=sep.device, dtype=sep.dtype).view(1, -1) mask = ids < sep.view(-1, 1) m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1) if memory_length > 0: m = m.expand(batch_size, -1, -1) m = torch.cat((hidden_states.new_ones((batch_size, seq_length, memory_length)), m), dim=2) m = m.unsqueeze(1) return m attention_mask = build_mask_matrix(query_length, sep, memory_length=memory_length) else: if attention_mask.dim() == 2: attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) attention_mask = attention_mask[:, :, :, -query_length - memory_length:] if self.block_position_encoding: position_ids, block_position_ids = position_ids[:, 0], position_ids[:, 1] position_embeddings = self.position_embeddings(position_ids) hidden_states = hidden_states + position_embeddings if self.block_position_encoding: block_position_embeddings = self.block_position_embeddings(block_position_ids) hidden_states = hidden_states + block_position_embeddings hidden_states = self.embedding_dropout(hidden_states) def check_detach(_hidden_states): return _hidden_states.detach() mem_layers = 
[check_detach(hidden_states)] for i, layer in enumerate(self.layers): args = [hidden_states, attention_mask] def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value return module(*inputs) return custom_forward mem_i = memory_states[i] if memory_states else None if self.checkpoint_activations: hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, mem=mem_i, ) else: hidden_states = layer(*args, mem=mem_i) mem_layers.append(check_detach(hidden_states)) # Final layer norm. output = self.final_layernorm(hidden_states) mem_layers = self.update_mems(mem_layers, memory_states) return (output, mem_layers) def update_mems(self, hiddens, mems): memory_length = mems[0].size(1) if mems else 0 query_length = hiddens[0].size(1) new_memory_length = memory_length + query_length new_mems = [] # with torch.no_grad(): for i in range(len(hiddens)): if new_memory_length <= query_length: new_mems.append(hiddens[i][:, -new_memory_length:]) else: new_mems.append(torch.cat((mems[i][:, -new_memory_length + query_length:], hiddens[i]), dim=1)) return new_mems class GLMPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" config_class = GLMConfig base_model_prefix = "glm" supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, torch.nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, torch.nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, torch.nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, GLMModel): module.gradient_checkpointing = value GLM_START_DOCSTRING = r""" This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config ([`~GLMConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ GLM_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`GLMTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. [What are token type IDs?](../glossary#token-type-ids) position_ids (`torch.LongTensor` of shape `({0})`, *optional*): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. [What are position IDs?](../glossary#position-ids) head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @add_start_docstrings( "The bare GLM Model transformer outputting raw hidden-states without any specific head on top.", GLM_START_DOCSTRING, ) class GLMModel(GLMPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in [Attention is all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ def __init__(self, config): super().__init__(config) self.config = config self.output_predict = config.output_predict # Word embeddings (parallel). 
self.word_embeddings = VocabEmbedding(config) # Transformer self.transformer = GLMStack(config.num_layers, config.hidden_size, config.num_attention_heads, config.max_sequence_length, config.embedding_dropout_prob, config.attention_dropout_prob, config.output_dropout_prob, config.checkpoint_activations, config.checkpoint_num_layers, attention_scale=config.attention_scale, block_position_encoding=config.block_position_encoding) # Initialize weights and apply final processing self.post_init() @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, position_ids=None, attention_mask=None, mems=None, **kwargs ): batch_size = input_ids.size(0) words_embeddings = self.word_embeddings(input_ids) embeddings = words_embeddings device = input_ids.device input_shape = input_ids.size() if position_ids is None: position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device) block_position_ids = torch.zeros(input_shape[-1], dtype=torch.long, device=device) position_ids = torch.stack((position_ids, block_position_ids), dim=0).unsqueeze(0) if attention_mask is None: attention_mask = torch.zeros(batch_size) # Transformer. 
        # --- tail of GLMModel.forward: run the transformer stack, then (optionally)
        # project hidden states onto the tied input-embedding matrix as LM logits.
        transformer_output = self.transformer(embeddings, position_ids, attention_mask, mems)
        last_hidden_states, mems = transformer_output
        logits = None
        if self.output_predict:
            # Weight-tied output head: reuse word_embeddings.weight as the projection.
            logits = F.linear(last_hidden_states, self.word_embeddings.weight)
        return ModelOutput(
            last_hidden_states=last_hidden_states,
            logits=logits,
            mems=mems,
        )


@add_start_docstrings(
    """GLM Model transformer for multiple choice classification""",
    GLM_START_DOCSTRING
)
class GLMForMultipleChoice(GLMPreTrainedModel):
    # Scores each candidate answer by summing the log-probabilities that the LM
    # head assigns to the candidate's tokens at the candidate's positions.

    def __init__(self, config):
        super().__init__(config)
        self.glm = GLMModel(config)
        self.post_init()

    def forward(
            self,
            input_ids=None,
            position_ids=None,
            attention_mask=None,
            choice_ids=None,       # per-sample sequence of candidate token-id tensors
            choice_indices=None,   # per-sample sequence of positions at which each candidate is scored
            labels=None,           # index of the correct choice, when supervised
            mems=None,
            **kwargs
    ):
        model_output = self.glm(input_ids, position_ids, attention_mask, mems=mems, **kwargs)
        lm_logits = model_output.logits
        log_probs = []
        # One score per (sample, choice): sum of token log-probs of the choice's
        # tokens read at the positions given by choice_indices.
        for output, choices, choice_index in zip(F.log_softmax(lm_logits, dim=-1), choice_ids, choice_indices):
            log_probs_single = []
            for choice, choice_target_id in zip(choices, choice_index):
                tmp = output[choice_target_id, choice]
                log_probs_single.append(tmp.sum())
            log_probs.append(torch.stack(log_probs_single))
        # NOTE(review): torch.stack requires every sample to carry the same number
        # of choices — presumably guaranteed by the data pipeline; confirm.
        log_probs = torch.stack(log_probs)
        loss = None
        if labels is not None:
            # The per-choice summed log-probs act as classification logits.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(log_probs, labels)
        return ModelOutput(
            loss=loss,
            logits=log_probs,
            lm_logits=lm_logits,
            mems=model_output.mems
        )


@add_start_docstrings(
    """GLM Model transformer with a `language modeling` head on top""",
    GLM_START_DOCSTRING,
)
class GLMForConditionalGeneration(GLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.glm = GLMModel(config)
        self.post_init()

    def _reorder_cache(self, past, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past is None:
            return past
        reordered_decoder_past = ()
        for layer_past_states in past:
            # get the correct batch idx from layer past batch dim
            # (beam search may have permuted the batch dimension)
            reordered_decoder_past = reordered_decoder_past + (
                layer_past_states.index_select(0, beam_idx.to(layer_past_states.device)),)
        return reordered_decoder_past

    def prepare_inputs_for_generation(self, input_ids, past=None, position_ids=None,
                                      generation_attention_mask=None, **kwargs):
        # only last token for inputs_ids if past is defined in kwargs
        attention_mask = generation_attention_mask
        seq_length = input_ids.shape[1]
        if past:
            # Incremental decoding: feed only the newest token; slice the matching
            # column/row of the (batch, 2, seq) position ids and the 4D mask.
            if position_ids is not None:
                position_ids = position_ids[:, :, seq_length - 1].unsqueeze(-1)
            if attention_mask is not None:
                attention_mask = attention_mask[:, :, seq_length - 1, :seq_length].unsqueeze(-2)
            input_ids = input_ids[:, -1].unsqueeze(-1)
        else:
            # First step: crop pre-built ids/mask to the current sequence length.
            if position_ids is not None:
                position_ids = position_ids[:, :, :seq_length]
            if attention_mask is not None:
                attention_mask = attention_mask[:, :, :seq_length, :seq_length]
        # Beam search replicates input_ids batch-wise; expand ids/mask to match.
        if position_ids is not None and input_ids.size(0) > position_ids.size(0):
            batch_size = position_ids.size(0)
            num_beams = input_ids.size(0) // batch_size
            position_ids = position_ids.unsqueeze(1).expand(-1, num_beams, -1, -1)
            position_ids = position_ids.reshape(batch_size * num_beams, *position_ids.shape[-2:])
        if attention_mask is not None and input_ids.size(0) > attention_mask.size(0):
            batch_size = attention_mask.size(0)
            num_beams = input_ids.size(0) // batch_size
            attention_mask = attention_mask.unsqueeze(1).expand(-1, num_beams, -1, -1, -1)
            attention_mask = attention_mask.reshape(batch_size * num_beams, *attention_mask.shape[-3:])
        return {
            "input_ids": input_ids,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "mems": past,
        }

    def forward(
            self,
            input_ids=None,
            position_ids=None,
            attention_mask=None,
            labels=None,
            mems=None,
            **kwargs
    ):
        model_output = self.glm(input_ids, position_ids, attention_mask, mems=mems, **kwargs)
        lm_logits = model_output.logits
        loss = None
        if labels is not None:
            # Padding positions are excluded from the LM loss.
            loss_fct = CrossEntropyLoss(ignore_index=self.config.pad_token_id)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
        return ModelOutput(
            loss=loss,
            logits=lm_logits,
            mems=model_output.mems
        )


@add_start_docstrings(
    """GLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """,
    GLM_START_DOCSTRING,
)
class GLMForSequenceClassification(GLMPreTrainedModel):
    def __init__(self, config: GLMConfig, hidden_dropout=None, num_class=1):
        super().__init__(config)
        self.pool_token = config.pool_token
        self.glm = GLMModel(config)
        # Classification only needs hidden states, not vocabulary logits.
        self.glm.output_predict = False
        self.num_class = num_class
        # Multi-choice head.
        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.output_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(self, input_ids=None, position_ids=None, attention_mask=None, labels=None):
        num_choices = None
        # 3D input means (batch, num_choices, seq): flatten choices into the batch.
        if len(input_ids.shape) == 3:
            batch_size, num_choices = input_ids.shape[:2]
            input_ids = input_ids.reshape(-1, input_ids.size(-1))
            attention_mask = attention_mask.reshape(-1, *attention_mask.size()[2:])
            position_ids = position_ids.reshape(-1, *position_ids.size()[2:])
        model_out = self.glm(input_ids, position_ids, attention_mask)
        outputs, mems = model_out.last_hidden_states, model_out.mems

        # Pool on the first token, then dense -> tanh -> project (RoBERTa-style head).
        output = outputs[:, 0, :]
        output = self.dropout(output)
        output = torch.tanh(self.dense(output))
        output = self.dropout(output)
        logits = self.out_proj(output)
        if num_choices is not None:
            # Restore (batch, num_choices) shape for the multiple-choice case.
            logits = logits.view(-1, num_choices)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits,
                            labels)
        # loss = F.cross_entropy(logits.contiguous().float(), labels.long())
        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs)


================================================
FILE: src/resources/models/glm-10B-chinese/tokenization_glm.py
================================================
import os
from typing import Optional, Tuple, List, Union
from shutil import copyfile
import torch

from transformers import PreTrainedTokenizer, RobertaTokenizer, GPT2Tokenizer, BertTokenizer
from transformers.utils import logging
from transformers.tokenization_utils_base import BatchEncoding
from transformers.models.auto.tokenization_auto import get_tokenizer_config
# from transformers.utils import torch_required
from transformers.utils.generic import _is_torch_device
import sentencepiece as spm

logger = logging.get_logger(__name__)


class GLMBatchEncoding(BatchEncoding):
    # @torch_required
    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        Send all values to device by calling `v.to(device)` (PyTorch only).

        Args:
            device (`str` or `torch.device`): The device to put the tensors on.

        Returns:
            [`BatchEncoding`]: The same instance after modification.
        """
        # This check catches things like APEX blindly calling "to" on all inputs to a module
        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
        # into a HalfTensor
        if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
            # Non-tensor values (e.g. lists of choice ids) are kept unchanged.
            self.data = {k: v.to(device=device) if torch.is_tensor(v) else v for k, v in self.data.items()}
        else:
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self


class GLMTokenizerMixin:
    # Shared GLM-specific helpers: special-token ids and builders for the
    # blank-filling inputs (2D position ids + block-diagonal attention masks).

    @property
    def sop_token(self) -> Optional[str]:
        return "<|startofpiece|>"

    @property
    def sop_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the start token in the vocabulary, used when training a model with autoregressive blank filling.
        """
        return self.convert_tokens_to_ids(self.sop_token)

    @property
    def eop_token(self) -> Optional[str]:
        return "<|endofpiece|>"

    @property
    def eop_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end token in the vocabulary, used when training a model with autoregressive blank filling.
        """
        return self.convert_tokens_to_ids(self.eop_token)

    @property
    def gmask_token_id(self) -> int:
        return self.convert_tokens_to_ids("[gMASK]")

    @property
    def smask_token_id(self) -> int:
        return self.convert_tokens_to_ids("[sMASK]")

    @property
    def mask_token_ids(self):
        # All mask variants a prompt may contain; order is not significant here.
        return [self.mask_token_id, self.smask_token_id, self.gmask_token_id]

    def _build_input_for_multiple_choice(self, context, choices):
        # Builds one sample: context followed by every choice, each choice
        # prefixed with <sop>, with causal masks per choice and 2D positions.
        context_id = context["input_ids"]
        if torch.is_tensor(context_id):
            context_id = context_id.tolist()
        division = len(context_id)
        # Choices are generated at the [MASK] slot of the context.
        mask_position = context_id.index(self.mask_token_id)
        token = torch.tensor(context_id, dtype=torch.long)
        # NOTE(review): assumes context["attention_mask"] is 1D (seq,) so it can
        # expand to a (seq, seq) block — confirm against callers.
        attention_mask = [context["attention_mask"].expand(division, -1)]
        position_id = torch.arange(division, dtype=torch.long)
        block_position_id = torch.zeros(division, dtype=torch.long)
        choice_ids, choice_indices = [], []
        for choice_str in choices:
            choice = torch.tensor(self(choice_str, add_special_tokens=False, padding=False)['input_ids'],
                                  dtype=torch.long)
            choice_ids.append(choice)
            # Positions at which this choice's tokens will be scored.
            choice_indices.append(torch.arange(len(token), len(token) + len(choice), dtype=torch.long))
            # Each choice attends causally within itself.
            attention_mask.append(torch.tril(torch.ones((len(choice), len(choice)), dtype=torch.long)))
            # Teacher forcing: feed <sop> + choice[:-1]; length stays len(choice).
            token = torch.cat((token, torch.tensor([self.sop_token_id], dtype=torch.long), choice[:-1]))
            # Block positions: first dim frozen at the mask slot, second counts inside the block.
            position_id = torch.cat((position_id, torch.tensor([mask_position] * len(choice), dtype=torch.long)))
            block_position_id = torch.cat((block_position_id, torch.arange(1, 1 + len(choice), dtype=torch.long)))
        attention_mask = torch.block_diag(*attention_mask)
        # Every choice token can also see the whole context.
        attention_mask[division:, :division] = context["attention_mask"].unsqueeze(0)
        return {
            "input_ids": token,
            "position_ids": torch.stack((position_id, block_position_id)),
            "attention_mask": attention_mask,
            "choice_ids": choice_ids,
            "choice_indices": choice_indices
        }

    def _pad_batch(self, tokens, position_ids, attention_mask, max_seq_length):
        # Right-pads one sample (ids, 2D positions, square mask) to max_seq_length.
        pad_length = max_seq_length - len(tokens)
        attention_mask = torch.nn.functional.pad(
            attention_mask,
            (0, pad_length, 0, pad_length),
            mode="constant",
            value=0,
        )
        tokens = torch.cat((tokens, torch.zeros(pad_length, dtype=torch.long)))
        # Repeat the last position for padding slots.
        position_ids = torch.cat((position_ids, position_ids[..., -1:].expand(-1, pad_length)), dim=-1)
        return tokens, position_ids, attention_mask

    def _collate(self, samples):
        # Pads all samples to a common length (rounded up to a TILE multiple)
        # and stacks them into batch tensors.
        TILE = 1
        length_to_pad = (max(map(lambda spl: len(spl["input_ids"]), samples)) + TILE - 1) // TILE * TILE

        token_batch, position_id_batch, attention_mask_batch = [], [], []
        choices_batch, choice_target_ids_batch = [], []
        for sample in samples:
            token, position_id, attention_mask = self._pad_batch(
                sample["input_ids"], sample["position_ids"], sample["attention_mask"], length_to_pad
            )
            token_batch.append(token)
            position_id_batch.append(position_id)
            attention_mask_batch.append(attention_mask)
            choices_batch.append(sample["choice_ids"])
            choice_target_ids_batch.append(sample["choice_indices"])
        return {
            "input_ids": torch.stack(token_batch),
            "position_ids": torch.stack(position_id_batch),
            # Insert a head dim so masks broadcast over attention heads.
            "attention_mask": torch.stack(attention_mask_batch).unsqueeze(1),
            "choice_ids": choices_batch,
            "choice_indices": choice_target_ids_batch,
        }

    def build_inputs_for_multiple_choice(self, model_input: BatchEncoding, choices, max_length=None):
        # Split the batched encoding into per-sample dicts, attach choices, re-collate.
        samples = [{key: value[i] for key, value in model_input.items()} for i in
                   range(len(model_input["input_ids"]))]
        samples = [self._build_input_for_multiple_choice(sample, choice) for sample, choice in
                   zip(samples, choices)]
        inputs = self._collate(samples)
        return GLMBatchEncoding(inputs)

    def build_inputs_for_generation(self, model_input: BatchEncoding, max_gen_length=512, targets=None,
                                    padding=False):
        # Extends an encoded prompt with the generation region: 2D position ids
        # anchored at the first mask token, and a causal mask for generated tokens.
        mask_ids = self.mask_token_ids
        input_ids = model_input.input_ids
        batch_size,
seq_length = input_ids.shape[:2]
        position_id, block_position_id = list(range(seq_length)), [0 for _ in range(seq_length)]
        position_ids, block_position_ids = [], []
        labels = None
        if targets is not None:
            # Teacher-forcing path: tokenize targets and build shifted labels.
            is_batched = isinstance(targets, (list, tuple))
            targets = self(targets, add_special_tokens=False, padding=False).input_ids
            if not is_batched:
                targets = [targets]
            assert len(targets) == len(input_ids)
            # Truncate and terminate every target with <eop>.
            targets = [target[:(max_gen_length - 1)] + [self.eop_token_id] for target in targets]
            if not padding:
                # Shrink the generation region to the longest target.
                max_gen_length = max(map(len, targets))
            targets = [[self.sop_token_id] + target for target in targets]
            # Labels are targets shifted left by one (predict next token).
            labels = [target[1:] for target in targets]
            targets = [target + [self.pad_token_id] * (max_gen_length + 1 - len(target)) for target in targets]
            labels = [label + [self.pad_token_id] * (max_gen_length - len(label)) for label in labels]
            targets = torch.tensor(targets, dtype=input_ids.dtype, device=input_ids.device)
            labels = torch.tensor(labels, dtype=input_ids.dtype, device=input_ids.device)
            # Prompt positions carry pad labels (no loss on the prompt).
            labels = torch.cat((input_ids.new_full((batch_size, seq_length), self.pad_token_id), labels), dim=1)
        for i in range(batch_size):
            # Anchor generation at the earliest mask token ([MASK]/[sMASK]/[gMASK]).
            mask_positions = []
            for mask_id in mask_ids:
                mask_positions += (input_ids[i] == mask_id).nonzero(as_tuple=True)[0].tolist()
            if not mask_positions:
                raise ValueError("Cannot find mask token in the input")
            mask_positions.sort()
            mask_pos = mask_positions[0]
            # First dim: frozen at the mask slot; second dim: 1..max_gen_length inside the block.
            position_ids.append(position_id + [mask_pos] * max_gen_length)
            block_position_ids.append(block_position_id + list(range(1, max_gen_length + 1)))
        position_ids = torch.tensor(position_ids, dtype=input_ids.dtype, device=input_ids.device)
        block_position_ids = torch.tensor(block_position_ids, dtype=input_ids.dtype, device=input_ids.device)
        position_ids = torch.stack((position_ids, block_position_ids), dim=1)
        attention_mask = model_input.attention_mask
        # Prompt columns are visible to every row (prompt + generated tokens).
        attention_mask = attention_mask.unsqueeze(1).expand(-1, seq_length + max_gen_length, -1)
        # Generated columns: invisible to the prompt rows, causal among themselves.
        generation_attention_mask = torch.cat(
            [
                attention_mask.new_zeros((seq_length, max_gen_length)),
                torch.tril(attention_mask.new_ones((max_gen_length, max_gen_length)))
            ],
            dim=0
        ).unsqueeze(0).expand(batch_size, -1, -1)
        attention_mask = torch.cat((attention_mask, generation_attention_mask), dim=2)
        attention_mask = attention_mask.unsqueeze(1)
        if targets is None:
            # Free generation: append a single <sop> to start decoding.
            input_ids = torch.cat((input_ids, input_ids.new_full((batch_size, 1), self.sop_token_id)), dim=-1)
        else:
            # Teacher forcing: feed <sop> + target[:-1].
            input_ids = torch.cat((input_ids, targets[:, :-1]), dim=1)
        batch = {"input_ids": input_ids, "position_ids": position_ids}
        if labels is None:
            batch["generation_attention_mask"] = attention_mask
        else:
            batch["attention_mask"] = attention_mask
            batch["labels"] = labels
        return BatchEncoding(batch)


class GLMRobertaTokenizer(RobertaTokenizer, GLMTokenizerMixin):
    model_input_names = ["input_ids", "position_ids", "attention_mask"]
    # Keep the most recent tokens when a prompt overflows.
    truncation_side: str = "left"

    @property
    def gmask_token_id(self) -> int:
        raise NotImplementedError("The model doesn't support gMASK")

    @property
    def smask_token_id(self) -> int:
        raise NotImplementedError("The model doesn't support sMASK")

    @property
    def mask_token_ids(self):
        # Only the plain [MASK] variant exists for this vocabulary.
        return [self.mask_token_id]


class GLMChineseTokenizer(PreTrainedTokenizer, GLMTokenizerMixin):
    # SentencePiece-backed tokenizer for the Chinese GLM checkpoints.
    vocab_files_names = {"vocab_file": "cog-pretrain.model"}
    truncation_side: str = "left"

    def __init__(self, vocab_file, **kwargs):
        super().__init__(**kwargs)
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.sp_model)

    def get_vocab(self):
        # Base SentencePiece vocabulary plus any added special tokens.
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return
self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): return self.sp_model.decode(tokens) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"] ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) return (out_vocab_file,) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (:obj:`List[int]`, `optional`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
""" #assert token_ids_1 is None #cls = [self.cls_token_id] #eos = [self.eos_token_id] #return cls + token_ids_0 + eos cls = [self.cls_token_id] eos = [self.eos_token_id] #eop = [self.eop_token_id] #mask = [self.mask_token_id] sep = [self.sep_token_id] #token_ids_0 = cls + token_ids_0 + mask + eos if token_ids_1 is None: return cls + token_ids_0 + eos else: return cls + token_ids_0 + sep + token_ids_1 + eos class GLMGPT2Tokenizer(GPT2Tokenizer, GLMTokenizerMixin): model_input_names = ["input_ids", "position_ids", "attention_mask"] truncation_side: str = "left" def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (:obj:`List[int]`, `optional`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
""" assert token_ids_1 is None cls = [self.cls_token_id] eos = [self.eos_token_id] return cls + token_ids_0 + eos class GLMBertTokenizer(BertTokenizer, GLMTokenizerMixin): model_input_names = ["input_ids", "position_ids", "attention_mask"] truncation_side: str = "left" @property def gmask_token_id(self) -> int: raise NotImplementedError("The model doesn't support gMASK") @property def smask_token_id(self) -> int: raise NotImplementedError("The model doesn't support sMASK") @property def mask_token_ids(self): return [self.mask_token_id] class GLMTokenizer: @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) config_tokenizer_class = tokenizer_config.get("tokenizer_class") if config_tokenizer_class == "GLMRobertaTokenizer": tokenizer_class = GLMRobertaTokenizer elif config_tokenizer_class == "GLMChineseTokenizer": tokenizer_class = GLMChineseTokenizer elif config_tokenizer_class == "GLMGPT2Tokenizer": tokenizer_class = GLMGPT2Tokenizer elif config_tokenizer_class == "GLMBertTokenizer": tokenizer_class = GLMBertTokenizer else: raise NotImplementedError("Not implemented tokenizer type:", config_tokenizer_class) return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) ================================================ FILE: src/resources/models/glm-10B-chinese/tokenizer_config.json ================================================ { "name_or_path": "BAAI/glm-large-chinese", "eos_token": "<|endoftext|>", "pad_token": "<|endoftext|>", "cls_token": "[CLS]", "mask_token": "[MASK]", "unk_token": "[UNK]", "sep_token": "[SEP]", "additional_special_tokens": ["<|startofpiece|>", "<|endofpiece|>", "[gMASK]", "[sMASK]"], "add_prefix_space": false, "tokenizer_class": "GLMChineseTokenizer", "use_fast": false, "auto_map": { "AutoTokenizer": [ "tokenization_glm.GLMChineseTokenizer", null ] } } ================================================ 
FILE: src/resources/models/glm-350M-chinese/README.md ================================================ --- language: - zh tags: - glm - chatgpt --- Link to github: [here](https://github.com/sunzeyeah/RLHF) --- 本仓库由[THUDM/glm-large-chinese](https://huggingface.co/THUDM/glm-large-chinese) fork而来,原仓库实现了PyTorch版本的GLM模型,该模型有3.5亿参数量,模型权重文件以FP32格式存储。 本仓库在原始代码的基础上进行了部分调整,以支持ChatGPT训练pipeline,具体实现可参考:[sunzeyeah/RLHF](https://github.com/sunzeyeah/RLHF). This repository is forked from [THUDM/glm-large-chinese](https://huggingface.co/THUDM/glm-large-chinese) that contains PyTorch implementation of GLM model with 350 million parameters pretrained weights (FP32 precision). It is slightly different from the original GLM implementation to support the ChatGPT training pipeline in this github repo: [sunzeyeah/RLHF](https://github.com/sunzeyeah/RLHF). --- # Model description GLM is a General Language Model pretrained with an autoregressive blank-filling objective and can be finetuned on various natural language understanding and generation tasks. Please refer to our paper for a detailed description of GLM: [GLM: General Language Model Pretraining with Autoregressive Blank Infilling](https://arxiv.org/abs/2103.10360) (ACL 2022) Zhengxiao Du*, Yujie Qian*, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, Jie Tang (*: equal contribution) Find more examples in our [Github repo](https://github.com/THUDM/GLM). `glm-10b-chinese` is pretrained on the [WuDaoCorpora](https://www.sciencedirect.com/science/article/pii/S2666651021000152) dataset. It has 48 transformer layers, with hidden size 4096 and 64 attention heads in each layer. The model is pretrained with autoregressive blank filling objectives designed for natural language understanding, seq2seq, and language modeling. 
--- # Usage (Text Generation) ```python from transformers import AutoTokenizer, AutoModelForSeq2SeqLM tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/glm-350M-chinese", trust_remote_code=True) model = AutoModelForSeq2SeqLM.from_pretrained("sunzeyeah/glm-350M-chinese", trust_remote_code=True) model = model.half().cuda() max_length = 512 prompt = "我不能确定对方是不是喜欢我,我却想分分秒秒跟他在一起,有谁能告诉我如何能想他少一点" prefix = "回答:" encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token) prompt_length = len(encoded_prompt['input_ids']) encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token, max_length=min(prompt_length, max_length), truncation="only_first", return_tensors="pt", return_token_type_ids=False) max_gen_length = max_length - encoded_dict['input_ids'].shape[1] inputs = tokenizer.build_inputs_for_generation(encoded_dict, max_gen_length=max_gen_length, padding=True) inputs = inputs.cuda() outputs = model.generate(**inputs, max_new_tokens=max_gen_length, eos_token_id=tokenizer.eop_token_id, pad_token_id=tokenizer.pad_token_id, do_sample=False, num_return_sequences=1, top_p=0.8, temperature=1.0) results = tokenizer.batch_decode(outputs, skip_special_tokens=True) print(results) ``` ================================================ FILE: src/resources/models/glm-350M-chinese/added_tokens.json ================================================ { "<|endoftext|>": 50000, "[SEP]": 50001, "[CLS]": 50002, "[MASK]": 50003, "[UNUSED1]": 50004, "[UNUSED2]": 50005, "<|startofpiece|>": 50006, "<|endofpiece|>": 50007, "[sMASK]": 50008, "[gMASK]": 50009 } ================================================ FILE: src/resources/models/glm-350M-chinese/config.json ================================================ { "model_name_or_path": "sunzeyeah/glm-350M-chinese", "architectures": [ "GLMModel" ], "attention_dropout_prob": 0.1, "attention_scale": 1.0, "auto_map": { "AutoConfig": "configuration_glm.GLMConfig", "AutoModel": "modeling_glm.GLMModel", "AutoModelForSeq2SeqLM": 
"modeling_glm.GLMForConditionalGeneration", "AutoModelForMultipleChoice": "modeling_glm.GLMForMultipleChoice" }, "block_position_encoding": true, "checkpoint_activations": false, "checkpoint_num_layers": 1, "embedding_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "max_sequence_length": 1024, "model_type": "glm", "num_attention_heads": 16, "num_layers": 24, "output_dropout_prob": 0.1, "output_predict": true, "parallel_output": true, "pool_token": "cls", "relative_encoding": false, "spell_func": "lstm", "spell_length": null, "torch_dtype": "float32", "vocab_size": 50048, "pad_token_id": 50000 } ================================================ FILE: src/resources/models/glm-350M-chinese/configuration_glm.py ================================================ # coding=utf-8 # Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ GLM model configuration """ from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) GLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { "shunxing1234/GLM": "https://huggingface.co/shunxing1234/GLM/resolve/main/config.json", # See all GLM models at https://huggingface.co/models?filter=glm } class GLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`~GLMModel`]. 
It is used to instantiate a GLM model according to the specified arguments, defining the model architecture.
type_vocab_size (`int`, *optional*, defaults to 2):
            The vocabulary size of the `token_type_ids` passed when calling [`~GLMModel`] or [`~TFGLMModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        Example:

    ```python
    >>> from transformers import GLMModel, GLMConfig

    >>> # Initializing a GLM shunxing1234/GLM-base-cased style configuration
    >>> configuration = GLMConfig()

    >>> # Initializing a model from the shunxing1234/GLM-base-cased style configuration
    >>> model = GLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "glm"
    # Map the HF-standard attribute name onto GLM's own field.
    attribute_map = {
        "num_hidden_layers": "num_layers"
    }

    def __init__(
            self,
            num_layers=24,
            vocab_size=30592,
            hidden_size=1024,
            num_attention_heads=16,
            embedding_dropout_prob=0.1,
            attention_dropout_prob=0.1,
            output_dropout_prob=0.1,
            max_sequence_length=512,
            checkpoint_activations=False,   # gradient checkpointing on/off
            checkpoint_num_layers=1,        # layers per checkpointed segment
            parallel_output=True,
            relative_encoding=False,        # Transformer-XL style relative positions
            block_position_encoding=True,   # GLM 2D (position, block-position) ids
            output_predict=False,           # project hidden states to vocab logits
            spell_length=None,              # prompt-tuning ("spell") length, if any
            spell_func="lstm",
            attention_scale=1.0,
            initializer_range=0.02,
            pool_token="cls",
            **kwargs
    ):
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.embedding_dropout_prob = embedding_dropout_prob
        self.attention_dropout_prob = attention_dropout_prob
        self.output_dropout_prob = output_dropout_prob
        self.max_sequence_length = max_sequence_length
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers
        self.parallel_output = parallel_output
        self.relative_encoding = relative_encoding
        self.block_position_encoding = block_position_encoding
        self.output_predict = output_predict
        self.spell_length = spell_length
        self.spell_func = spell_func
        self.attention_scale = attention_scale
        self.initializer_range = initializer_range
        self.pool_token = pool_token

        super().__init__(**kwargs)


================================================
FILE: src/resources/models/glm-350M-chinese/modeling_glm.py
================================================
# coding=utf-8
# Copyright 2022 shunxing1234 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch GLM model.
""" import math import torch import torch.utils.checkpoint import torch.nn.functional as F from torch.nn import init, LayerNorm, Linear, CrossEntropyLoss from transformers.activations import gelu from transformers.utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, ) from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, ModelOutput, ) from transformers.modeling_utils import ( PreTrainedModel, ) from .configuration_glm import GLMConfig from torch.nn.parameter import Parameter _CHECKPOINT_FOR_DOC = "shunxing1234/GLM" _CONFIG_FOR_DOC = "GLMConfig" _TOKENIZER_FOR_DOC = "GLMTokenizer" GLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ "shunxing1234/GLM", # See all GLM models at https://huggingface.co/models?filter=glm ] def unscaled_init_method(sigma): """Init method based on N(0, sigma).""" def init_(tensor): return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) return init_ def scaled_init_method(mean, std, num_layers): """Init method based on N(0, sigma/sqrt(2*num_layers).""" std = std / math.sqrt(2.0 * num_layers) def init_(tensor): return torch.nn.init.normal_(tensor, mean=mean, std=std) return init_ def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, '{} is not divisible by {}'.format( numerator, denominator) def divide(numerator, denominator): """Ensure that numerator is divisible by the denominator and return the division value.""" ensure_divisibility(numerator, denominator) return numerator // denominator def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False): """Split a tensor along its last dimension. Arguments: tensor: input tensor. num_partitions: number of partitions to split the tensor contiguous_split_chunks: If True, make each chunk contiguous in memory. """ # Get the size and dimension. 
last_dim = tensor.dim() - 1 last_dim_size = divide(tensor.size()[last_dim], num_partitions) # Split. tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) # Note: torch.split does not create contiguous tensors by default. if contiguous_split_chunks: return tuple(chunk.contiguous() for chunk in tensor_list) return tensor_list class MLP(torch.nn.Module): """MLP for GPT2. MLP will take the input with h hidden state, project it to 4*h hidden dimension, perform gelu transformation, and project the state back into h hidden dimension. At the end, dropout is also applied. Arguments: hidden_size: The hidden size of the self attention. output_dropout_prob: dropout probability for the outputs after self attention and final output. init_method: initialization method used for the weights. Note that all biases are initialized to zero and layernorm weight are initialized to one. output_layer_init_method: output layer initialization. If None, use `init_method`. """ def __init__(self, hidden_size, output_dropout_prob, init_method, output_layer_init_method=None): super(MLP, self).__init__() # Set output layer initialization if not provided. if output_layer_init_method is None: output_layer_init_method = init_method # Project to 4h. self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size) # Project back to h. self.dense_4h_to_h = Linear( 4 * hidden_size, hidden_size) self.dropout = torch.nn.Dropout(output_dropout_prob) def forward(self, hidden_states): # [b, s, 4hp] intermediate_parallel = self.dense_h_to_4h(hidden_states) intermediate_parallel = gelu(intermediate_parallel) # [b, s, h] output = self.dense_4h_to_h(intermediate_parallel) output = self.dropout(output) return output class VocabEmbedding(torch.nn.Module): """Embedding parallelized in the vocabulary dimension. This is mainly adapted from torch.nn.Embedding and all the default values are kept. Arguments: num_embeddings: vocabulary size. embedding_dim: size of hidden state. 
init_method: method to initialize weights. """ def __init__(self, config): super(VocabEmbedding, self).__init__() # Keep the input dimensions. self.num_embeddings = config.vocab_size self.embedding_dim = config.hidden_size # Set the detauls for compatibility. self.padding_idx = None self.max_norm = None self.norm_type = 2. self.scale_grad_by_freq = False self.sparse = False self._weight = None self.vocab_start_index = 0 self.vocab_end_index = self.num_embeddings # Allocate weights. weight = torch.Tensor(self.num_embeddings, self.embedding_dim) self.register_buffer("weight", weight) # And initialize. init.xavier_normal_(self.weight) def forward(self, input_): # Get the embeddings. output = F.embedding(input_, self.weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse) return output class PositionalEmbedding(torch.nn.Module): def __init__(self, hidden_size): super(PositionalEmbedding, self).__init__() self.hidden_size = hidden_size inv_freq = 1 / (10000 ** (torch.arange(0.0, hidden_size, 2.0) / hidden_size)) self.register_buffer('inv_freq', inv_freq) def forward(self, pos_seq, bsz=None): sinusoid_inp = torch.ger(pos_seq, self.inv_freq) pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) if bsz is not None: return pos_emb[None, :, :].expand(bsz, -1, -1) else: return pos_emb[None, :, :] class SelfAttention(torch.nn.Module): """self-attention layer for GLM. Self-attention layer takes input with size [b, s, h] where b is the batch size, s is the sequence lenght, and h is the hidden size and creates output of the same size. Arguments: hidden_size: total hidden size of the layer (h). num_attention_heads: number of attention heads (n). Note that we require n to be divisible by number of GPUs used to parallelize the model. Also, we require hidden size to be divisible by n. attention_dropout_prob: dropout probability for the attention scores. init_method: weight initialization. 
        output_layer_init_method: output layer initialization. If None, use
            `init_method`.

    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    """

    def __init__(self, hidden_size, num_attention_heads,
                 attention_dropout_prob, output_dropout_prob,
                 init_method, output_layer_init_method=None,
                 attention_scale=1.0):
        super(SelfAttention, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Per attention head and per partition values.
        self.hidden_size = hidden_size
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads = num_attention_heads
        # > 1.0 enables the rescaled, max-subtracted score path in forward().
        self.attention_scale = attention_scale
        # Strided linear layer: produces Q, K and V with one projection.
        self.query_key_value = Linear(hidden_size, 3 * hidden_size)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)
        # Output.
        self.dense = Linear(hidden_size, hidden_size)
        self.output_dropout = torch.nn.Dropout(output_dropout_prob)

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + \
                           (self.num_attention_heads,
                            self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        # [b, s, np, hn] -> [b, np, s, hn]
        return tensor.permute(0, 2, 1, 3)

    def forward(self, hidden_states, ltor_mask, mem=None):
        # hidden_states: [b, s, h]
        # ltor_mask: [b,1,s,s]
        # Attention heads. [b, s, hp]
        query_length = hidden_states.size(1)
        # self attention
        if mem is None:
            mixed_x_layer = self.query_key_value(hidden_states)
            (mixed_query_layer,
             mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
        else:
            # Prepend cached memory states so K/V cover past + current tokens;
            # queries are kept only for the current `query_length` positions.
            cat = torch.cat((mem, hidden_states), 1)
            mixed_x_layer = self.query_key_value(cat)
            (mixed_query_layer,
             mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
            mixed_query_layer = mixed_query_layer[:, -query_length:]

        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        if self.attention_scale > 1.0:
            # Raw attention scores. [b, np, s, s]
            # Pre-divide Q and K by sqrt(attention_scale) to keep the raw
            # scores in a numerically safe range (undone after masking below).
            attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_scale),
                                            key_layer.transpose(-1, -2) / math.sqrt(
                                                self.hidden_size_per_attention_head * self.attention_scale))
        else:
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2) / math.sqrt(
                self.hidden_size_per_attention_head))

        # Apply the left to right attention mask.
        ltor_mask = ltor_mask.type_as(attention_scores)
        attention_scores = torch.mul(attention_scores, ltor_mask)
        if self.attention_scale > 1.0:
            # Subtract the per-row max for numerical stability, then undo the
            # pre-scaling applied above before the softmax.
            max_attention_scores = attention_scores.max(dim=-1, keepdim=True)[0]
            attention_scores -= max_attention_scores
            attention_scores *= self.attention_scale
        # -65504.0 is the most negative fp16 value: masked positions become
        # effectively -inf so they vanish under the softmax.
        attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask)
        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        # with get_cuda_rng_tracker().fork():
        attention_probs = self.attention_dropout(attention_probs)
        # Context layer.
class GLMBlock(torch.nn.Module):
    """A single pre-LayerNorm transformer layer for GLM.

    Takes input of size [b, s, h] and returns output of the same size:
        LN -> self-attention -> residual -> LN -> MLP -> residual.

    Arguments:
        hidden_size: hidden size h of the self attention.
        num_attention_heads: number of attention heads.
        attention_dropout_prob: dropout probability on attention scores.
        output_dropout_prob: dropout probability on attention/MLP outputs.
        layernorm_epsilon: epsilon used in layernorm to avoid division by zero.
        init_method: weight initialization (forwarded to submodules).
        output_layer_init_method: init for output layers; defaults to
            `init_method` when None.
        attention_scale: numeric-stability scale forwarded to SelfAttention.
    """

    def __init__(self, hidden_size, num_attention_heads,
                 attention_dropout_prob, output_dropout_prob,
                 layernorm_epsilon, init_method,
                 output_layer_init_method=None, attention_scale=1.0):
        super(GLMBlock, self).__init__()
        # Default the output-layer init to the generic init method.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # LayerNorm applied to the block input.
        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
        # Self attention.
        self.attention = SelfAttention(
            hidden_size, num_attention_heads,
            attention_dropout_prob, output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method,
            attention_scale=attention_scale)
        # LayerNorm applied after the attention residual.
        self.post_attention_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
        # Position-wise feed-forward network.
        self.mlp = MLP(hidden_size, output_dropout_prob, init_method,
                       output_layer_init_method=output_layer_init_method)

    def forward(self, hidden_states, ltor_mask, mem=None):
        # hidden_states: [b, s, h]; ltor_mask: [b, 1, s, s]
        normed_input = self.input_layernorm(hidden_states)
        # NOTE(review): the cached memory is normalized with this layer's
        # *input* LayerNorm before attention — confirm this is intentional.
        normed_mem = self.input_layernorm(mem) if mem is not None else None
        # Attention + first residual connection.
        attention_output = self.attention(normed_input, ltor_mask, normed_mem)
        attn_residual = hidden_states + attention_output
        # MLP + second residual connection.
        mlp_output = self.mlp(self.post_attention_layernorm(attn_residual))
        return attn_residual + mlp_output
class GLMStack(torch.nn.Module):
    """GLM transformer stack.

    Takes embedded input, adds (block-)position embeddings, applies embedding
    dropout, runs `num_layers` GLMBlocks and a final LayerNorm, and returns
    `(output, mem_layers)` where `mem_layers` holds detached per-layer hidden
    states for incremental decoding.

    Arguments:
        num_layers: number of transformer layers.
        hidden_size: hidden size of the self attention.
        num_attention_heads: number of attention heads.
        max_sequence_length: maximum supported position id.
        embedding_dropout_prob: dropout probability on the embeddings.
        attention_dropout_prob: dropout probability on attention scores.
        output_dropout_prob: dropout probability on layer outputs.
        checkpoint_activations: if True, recompute activations in backward
            via torch.utils.checkpoint.
        checkpoint_num_layers: chunk size for checkpointing (stored, unused
            in this implementation).
        layernorm_epsilon: epsilon used in layernorm to avoid division by zero.
        init_method_std: std of the N(0, std) init method.
        use_scaled_init_for_output_weights: if True use std/sqrt(2*num_layers)
            for output weights (attention output and MLP output).
        block_position_encoding: if True, use GLM's 2D (position, block
            position) encoding with an extra embedding table.
        attention_scale: numeric-stability scale forwarded to the layers.
    """

    def __init__(self,
                 num_layers,
                 hidden_size,
                 num_attention_heads,
                 max_sequence_length,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 layernorm_epsilon=1.0e-5,
                 init_method_std=0.02,
                 use_scaled_init_for_output_weights=True,
                 block_position_encoding=False,
                 attention_scale=1.0,
                 ):
        super(GLMStack, self).__init__()
        self.hidden_size = hidden_size
        # Store activation checkpointing flags.
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers

        output_layer_init_method = None
        if use_scaled_init_for_output_weights:
            output_layer_init_method = scaled_init_method(0.0, init_method_std,
                                                          num_layers)
        # Embeddings dropout.
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
        self.block_position_encoding = block_position_encoding

        # Position embedding (serial). The 2D encoding reserves one extra slot.
        if block_position_encoding:
            self.position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size)
            self.block_position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size)
            torch.nn.init.normal_(self.block_position_embeddings.weight, mean=0.0, std=init_method_std)
        else:
            self.position_embeddings = torch.nn.Embedding(max_sequence_length, hidden_size)
        # Initialize the position embeddings.
        torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)

        def get_layer():
            return GLMBlock(
                hidden_size,
                num_attention_heads,
                attention_dropout_prob,
                output_dropout_prob,
                layernorm_epsilon,
                unscaled_init_method(init_method_std),
                output_layer_init_method=output_layer_init_method,
                attention_scale=attention_scale)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer() for _ in range(num_layers)])

        # Final layer norm before output.
        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    def forward(self, hidden_states, position_ids, attention_mask, memory_states=None):
        batch_size, query_length = hidden_states.size()[:2]
        memory_length = memory_states[0].size(1) if memory_states else 0
        # attention mask is the beginning postion of B region, \in [0, query_len)
        is_scalar = torch.numel(attention_mask) == 1
        is_sep = is_scalar or torch.numel(attention_mask) == batch_size
        if is_sep:
            sep = attention_mask.item() if is_scalar else attention_mask

            # Build a causal mask whose first `sep` columns are fully visible
            # (the bidirectional A region), optionally prefixed with all-ones
            # columns for cached memory tokens.
            def build_mask_matrix(seq_length, sep, memory_length=0):
                m = hidden_states.new_ones((1, seq_length, seq_length))
                m = torch.tril(m)
                if is_scalar:
                    m[0, :, :int(sep)] = 1
                else:
                    m = m.expand(batch_size, -1, -1)
                    ids = torch.arange(seq_length, device=sep.device, dtype=sep.dtype).view(1, -1)
                    mask = ids < sep.view(-1, 1)
                    m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1)
                if memory_length > 0:
                    m = m.expand(batch_size, -1, -1)
                    m = torch.cat((hidden_states.new_ones((batch_size, seq_length, memory_length)), m), dim=2)
                m = m.unsqueeze(1)
                return m

            attention_mask = build_mask_matrix(query_length, sep, memory_length=memory_length)
        else:
            # A full mask was supplied; keep only the last query+memory columns.
            if attention_mask.dim() == 2:
                attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
            attention_mask = attention_mask[:, :, :, -query_length - memory_length:]

        if self.block_position_encoding:
            # position_ids: [b, 2, s] -> split into positions and block positions.
            position_ids, block_position_ids = position_ids[:, 0], position_ids[:, 1]
        position_embeddings = self.position_embeddings(position_ids)

        hidden_states = hidden_states + position_embeddings
        if self.block_position_encoding:
            block_position_embeddings = self.block_position_embeddings(block_position_ids)
            hidden_states = hidden_states + block_position_embeddings
        hidden_states = self.embedding_dropout(hidden_states)

        def check_detach(_hidden_states):
            return _hidden_states.detach()

        # Collect detached hidden states per layer for incremental decoding.
        mem_layers = [check_detach(hidden_states)]

        for i, layer in enumerate(self.layers):

            def create_custom_forward(module):
                def custom_forward(*inputs):
                    # None for past_key_value
                    return module(*inputs)

                return custom_forward

            mem_i = memory_states[i] if memory_states else None
            if self.checkpoint_activations:
                # Bug fix: the original call dropped `attention_mask` entirely
                # and passed `mem` as a keyword to checkpoint() (which does not
                # forward kwargs to the function in older torch versions), so
                # the checkpointed path crashed. Pass all layer inputs
                # positionally, mirroring the non-checkpointed path.
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer),
                    hidden_states,
                    attention_mask,
                    mem_i,
                )
            else:
                hidden_states = layer(hidden_states, attention_mask, mem=mem_i)
            mem_layers.append(check_detach(hidden_states))

        # Final layer norm.
        output = self.final_layernorm(hidden_states)
        mem_layers = self.update_mems(mem_layers, memory_states)
        return (output, mem_layers)

    def update_mems(self, hiddens, mems):
        """Concatenate old memories with the new hidden states per layer,
        keeping at most `memory_length + query_length` trailing positions."""
        memory_length = mems[0].size(1) if mems else 0
        query_length = hiddens[0].size(1)
        new_memory_length = memory_length + query_length

        new_mems = []
        for i in range(len(hiddens)):
            if new_memory_length <= query_length:
                new_mems.append(hiddens[i][:, -new_memory_length:])
            else:
                new_mems.append(torch.cat((mems[i][:, -new_memory_length + query_length:], hiddens[i]), dim=1))
        return new_mems
""" config_class = GLMConfig base_model_prefix = "glm" supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, torch.nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, torch.nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, torch.nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, GLMModel): module.gradient_checkpointing = value GLM_START_DOCSTRING = r""" This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config ([`~GLMConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ GLM_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`GLMTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. [What are token type IDs?](../glossary#token-type-ids) position_ids (`torch.LongTensor` of shape `({0})`, *optional*): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. [What are position IDs?](../glossary#position-ids) head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @add_start_docstrings( "The bare GLM Model transformer outputting raw hidden-states without any specific head on top.", GLM_START_DOCSTRING, ) class GLMModel(GLMPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in [Attention is all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ def __init__(self, config): super().__init__(config) self.config = config self.output_predict = config.output_predict # Word embeddings (parallel). 
self.word_embeddings = VocabEmbedding(config) # Transformer self.transformer = GLMStack(config.num_layers, config.hidden_size, config.num_attention_heads, config.max_sequence_length, config.embedding_dropout_prob, config.attention_dropout_prob, config.output_dropout_prob, config.checkpoint_activations, config.checkpoint_num_layers, attention_scale=config.attention_scale, block_position_encoding=config.block_position_encoding) # Initialize weights and apply final processing self.post_init() @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, position_ids=None, attention_mask=None, mems=None, **kwargs ): batch_size = input_ids.size(0) words_embeddings = self.word_embeddings(input_ids) embeddings = words_embeddings device = input_ids.device input_shape = input_ids.size() if position_ids is None: position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device) block_position_ids = torch.zeros(input_shape[-1], dtype=torch.long, device=device) position_ids = torch.stack((position_ids, block_position_ids), dim=0).unsqueeze(0) if attention_mask is None: attention_mask = torch.zeros(batch_size) # Transformer. transformer_output = self.transformer(embeddings, position_ids, attention_mask, mems) logits, hidden_layers = transformer_output # outputs = hidden_layers if self.output_predict: # Parallel logits. 
# logits_parallel = mpu.copy_to_model_parallel_region( # logits) logits = F.linear(logits, self.word_embeddings.weight) return ModelOutput( logits=logits, mems=hidden_layers, ) @add_start_docstrings( """GLM Model transformer for multiple choice classification""", GLM_START_DOCSTRING ) class GLMForMultipleChoice(GLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.glm = GLMModel(config) self.post_init() def forward( self, input_ids=None, position_ids=None, attention_mask=None, choice_ids=None, choice_indices=None, labels=None, mems=None, **kwargs ): model_output = self.glm.forward(input_ids, position_ids, attention_mask, mems=mems, **kwargs) lm_logits = model_output.logits log_probs = [] for output, choices, choice_index in zip(F.log_softmax(lm_logits, dim=-1), choice_ids, choice_indices): log_probs_single = [] for choice, choice_target_id in zip(choices, choice_index): tmp = output[choice_target_id, choice] log_probs_single.append(tmp.sum()) log_probs.append(torch.stack(log_probs_single)) log_probs = torch.stack(log_probs) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(log_probs, labels) return ModelOutput( loss=loss, logits=log_probs, lm_logits=lm_logits, mems=model_output.mems ) @add_start_docstrings( """GLM Model transformer with a `language modeling` head on top""", GLM_START_DOCSTRING, ) class GLMForConditionalGeneration(GLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.glm = GLMModel(config) self.post_init() def _reorder_cache(self, past, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder if past is None: return past reordered_decoder_past = () for layer_past_states in past: # get the correct batch idx from layer past batch dim reordered_decoder_past = reordered_decoder_past + ( layer_past_states.index_select(0, beam_idx.to(layer_past_states.device)),) return reordered_decoder_past def 
prepare_inputs_for_generation(self, input_ids, past=None, position_ids=None, generation_attention_mask=None, **kwargs): # only last token for inputs_ids if past is defined in kwargs attention_mask = generation_attention_mask seq_length = input_ids.shape[1] if past: if position_ids is not None: position_ids = position_ids[:, :, seq_length - 1].unsqueeze(-1) if attention_mask is not None: attention_mask = attention_mask[:, :, seq_length - 1, :seq_length].unsqueeze(-2) input_ids = input_ids[:, -1].unsqueeze(-1) else: if position_ids is not None: position_ids = position_ids[:, :, :seq_length] if attention_mask is not None: attention_mask = attention_mask[:, :, :seq_length, :seq_length] return { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask, "mems": past, } def forward( self, input_ids=None, position_ids=None, attention_mask=None, labels=None, mems=None, **kwargs ): model_output = self.glm.forward(input_ids, position_ids, attention_mask, mems=mems, **kwargs) lm_logits = model_output.logits loss = None if labels is not None: loss_fct = CrossEntropyLoss(ignore_index=self.config.pad_token_id) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) return ModelOutput( loss=loss, logits=lm_logits, mems=model_output.mems ) ================================================ FILE: src/resources/models/glm-350M-chinese/tokenization_glm.py ================================================ import os from typing import Optional, Tuple, List, Union from shutil import copyfile import torch from transformers import PreTrainedTokenizer, RobertaTokenizer, GPT2Tokenizer, BertTokenizer from transformers.utils import logging from transformers.tokenization_utils_base import BatchEncoding from transformers.models.auto.tokenization_auto import get_tokenizer_config # from transformers.utils import torch_required from transformers.utils.generic import _is_torch_device import sentencepiece as spm logger = logging.get_logger(__name__) class 
GLMBatchEncoding(BatchEncoding): # @torch_required def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": """ Send all values to device by calling `v.to(device)` (PyTorch only). Args: device (`str` or `torch.device`): The device to put the tensors on. Returns: [`BatchEncoding`]: The same instance after modification. """ # This check catches things like APEX blindly calling "to" on all inputs to a module # Otherwise it passes the casts down and casts the LongTensor containing the token idxs # into a HalfTensor if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): self.data = {k: v.to(device=device) if torch.is_tensor(v) else v for k, v in self.data.items()} else: logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") return self class GLMTokenizerMixin: @property def sop_token(self) -> Optional[str]: return "<|startofpiece|>" @property def sop_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the start token in the vocabulary, used when training a model with autoregressive blank filling. """ return self.convert_tokens_to_ids(self.sop_token) @property def eop_token(self) -> Optional[str]: return "<|endofpiece|>" @property def eop_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the end token in the vocabulary, used when training a model with autoregressive blank filling. 
""" return self.convert_tokens_to_ids(self.eop_token) @property def gmask_token_id(self) -> int: return self.convert_tokens_to_ids("[gMASK]") @property def smask_token_id(self) -> int: return self.convert_tokens_to_ids("[sMASK]") @property def mask_token_ids(self): return [self.mask_token_id, self.smask_token_id, self.gmask_token_id] def _build_input_for_multiple_choice(self, context, choices): context_id = context["input_ids"] if torch.is_tensor(context_id): context_id = context_id.tolist() division = len(context_id) mask_position = context_id.index(self.mask_token_id) token = torch.tensor(context_id, dtype=torch.long) attention_mask = [context["attention_mask"].expand(division, -1)] position_id = torch.arange(division, dtype=torch.long) block_position_id = torch.zeros(division, dtype=torch.long) choice_ids, choice_indices = [], [] for choice_str in choices: choice = torch.tensor(self(choice_str, add_special_tokens=False, padding=False)['input_ids'], dtype=torch.long) choice_ids.append(choice) choice_indices.append(torch.arange(len(token), len(token) + len(choice), dtype=torch.long)) attention_mask.append(torch.tril(torch.ones((len(choice), len(choice)), dtype=torch.long))) token = torch.cat((token, torch.tensor([self.sop_token_id], dtype=torch.long), choice[:-1])) position_id = torch.cat((position_id, torch.tensor([mask_position] * len(choice), dtype=torch.long))) block_position_id = torch.cat((block_position_id, torch.arange(1, 1 + len(choice), dtype=torch.long))) attention_mask = torch.block_diag(*attention_mask) attention_mask[division:, :division] = context["attention_mask"].unsqueeze(0) return { "input_ids": token, "position_ids": torch.stack((position_id, block_position_id)), "attention_mask": attention_mask, "choice_ids": choice_ids, "choice_indices": choice_indices } def _pad_batch(self, tokens, position_ids, attention_mask, max_seq_length): pad_length = max_seq_length - len(tokens) attention_mask = torch.nn.functional.pad( attention_mask, (0, pad_length, 
    def _collate(self, samples):
        # Pad all samples to a common length (TILE allows rounding the padded
        # length up to a multiple; 1 means exact max length).
        TILE = 1
        length_to_pad = (max(map(lambda spl: len(spl["input_ids"]), samples)) + TILE - 1) // TILE * TILE

        token_batch, position_id_batch, attention_mask_batch = [], [], []
        choices_batch, choice_target_ids_batch = [], []

        for sample in samples:
            token, position_id, attention_mask = self._pad_batch(
                sample["input_ids"], sample["position_ids"], sample["attention_mask"], length_to_pad
            )
            token_batch.append(token)
            position_id_batch.append(position_id)
            attention_mask_batch.append(attention_mask)
            choices_batch.append(sample["choice_ids"])
            choice_target_ids_batch.append(sample["choice_indices"])

        return {
            "input_ids": torch.stack(token_batch),
            "position_ids": torch.stack(position_id_batch),
            "attention_mask": torch.stack(attention_mask_batch).unsqueeze(1),
            "choice_ids": choices_batch,
            "choice_indices": choice_target_ids_batch,
        }

    def build_inputs_for_multiple_choice(self, model_input: BatchEncoding, choices, max_length=None):
        # Split the batched encoding back into per-sample dicts, splice in the
        # choices for each sample, then re-collate into a padded batch.
        samples = [{key: value[i] for key, value in model_input.items()} for i in range(len(model_input["input_ids"]))]
        samples = [self._build_input_for_multiple_choice(sample, choice) for sample, choice in
                   zip(samples, choices)]
        inputs = self._collate(samples)
        return GLMBatchEncoding(inputs)

    def build_inputs_for_generation(self, model_input: BatchEncoding, max_gen_length=512, targets=None, padding=False):
        # Extend an encoded prompt with generation positions/masks; when
        # `targets` is given, also build teacher-forced inputs and labels.
        mask_ids = self.mask_token_ids
        input_ids = model_input.input_ids
        batch_size, seq_length = input_ids.shape[:2]
        labels = None
        # create labels
        if targets is not None:
            is_batched = isinstance(targets, (list, tuple))
            targets = self(targets, add_special_tokens=False, padding=False).input_ids
            if not is_batched:
                targets = [targets]
            assert len(targets) == len(input_ids)
            # Truncate to leave room for <eop>, then append it.
            targets = [target[:(max_gen_length - 1)] + [self.eop_token_id] for target in targets]
            if not padding:
                max_gen_length = max(map(len, targets))
            # Inputs start with <sop>; labels are the inputs shifted by one.
            targets = [[self.sop_token_id] + target for target in targets]
            labels = [target[1:] for target in targets]
            targets = [target + [self.pad_token_id] * (max_gen_length + 1 - len(target)) for target in targets]
            labels = [label + [self.pad_token_id] * (max_gen_length - len(label)) for label in labels]
            targets = torch.tensor(targets, dtype=input_ids.dtype, device=input_ids.device)
            labels = torch.tensor(labels, dtype=input_ids.dtype, device=input_ids.device)
            # Prompt positions carry pad labels (excluded from the loss).
            labels = torch.cat((input_ids.new_full((batch_size, seq_length), self.pad_token_id), labels), dim=1)
        # create position ids
        position_id, block_position_id = list(range(seq_length)), [0 for _ in range(seq_length)]
        position_ids, block_position_ids = [], []
        for i in range(batch_size):
            mask_positions = []
            for mask_id in mask_ids:
                mask_positions += (input_ids[i] == mask_id).nonzero(as_tuple=True)[0].tolist()
            if not mask_positions:
                raise ValueError("Cannot find mask token in the input")
            mask_positions.sort()
            # Generated tokens all share the first mask's position; their block
            # positions count 1..max_gen_length.
            mask_pos = mask_positions[0]
            position_ids.append(position_id + [mask_pos] * max_gen_length)
            block_position_ids.append(block_position_id + list(range(1, max_gen_length + 1)))
        position_ids = torch.tensor(position_ids, dtype=input_ids.dtype, device=input_ids.device)
        block_position_ids = torch.tensor(block_position_ids, dtype=input_ids.dtype, device=input_ids.device)
        position_ids = torch.stack((position_ids, block_position_ids), dim=1)
        # create attention mask: prompt columns fully visible, generated
        # region lower-triangular (causal).
        attention_mask = model_input.attention_mask
        attention_mask = attention_mask.unsqueeze(1).expand(-1, seq_length + max_gen_length, -1)
        generation_attention_mask = torch.cat([attention_mask.new_zeros((seq_length, max_gen_length)),
                                               torch.tril(attention_mask.new_ones((max_gen_length, max_gen_length)))],
                                              dim=0).unsqueeze(0).expand(batch_size, -1, -1)
        attention_mask = torch.cat((attention_mask, generation_attention_mask), dim=2)
        attention_mask = attention_mask.unsqueeze(1)
        # create input ids
        if targets is None:
            # Free generation: just append <sop> as the first decoder input.
            input_ids = torch.cat((input_ids, input_ids.new_full((batch_size, 1), self.sop_token_id)), dim=-1)
        else:
            # Teacher forcing: append all target inputs except the final token.
            input_ids = torch.cat((input_ids, targets[:, :-1]), dim=1)
        batch = {"input_ids": input_ids, "position_ids": position_ids}
        if labels is None:
            batch["generation_attention_mask"] = attention_mask
        else:
            batch["attention_mask"] = attention_mask
            batch["labels"] = labels
        return BatchEncoding(batch)
attention_mask.unsqueeze(1) # create input ids if targets is None: input_ids = torch.cat((input_ids, input_ids.new_full((batch_size, 1), self.sop_token_id)), dim=-1) else: input_ids = torch.cat((input_ids, targets[:, :-1]), dim=1) batch = {"input_ids": input_ids, "position_ids": position_ids} if labels is None: batch["generation_attention_mask"] = attention_mask else: batch["attention_mask"] = attention_mask batch["labels"] = labels return BatchEncoding(batch) class GLMRobertaTokenizer(RobertaTokenizer, GLMTokenizerMixin): model_input_names = ["input_ids", "position_ids", "attention_mask"] truncation_side: str = "left" @property def gmask_token_id(self) -> int: raise NotImplementedError("The model doesn't support gMASK") @property def smask_token_id(self) -> int: raise NotImplementedError("The model doesn't support sMASK") @property def mask_token_ids(self): return [self.mask_token_id] class GLMChineseTokenizer(PreTrainedTokenizer, GLMTokenizerMixin): vocab_files_names = {"vocab_file": "cog-pretrain.model"} truncation_side: str = "left" def __init__(self, vocab_file, **kwargs): super().__init__(**kwargs) self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return len(self.sp_model) def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text, **kwargs): return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): return self.sp_model.decode(tokens) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not 
os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"] ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) return (out_vocab_file,) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (:obj:`List[int]`, `optional`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
""" #assert token_ids_1 is None #cls = [self.cls_token_id] #eos = [self.eos_token_id] #return cls + token_ids_0 + eos cls = [self.cls_token_id] eos = [self.eos_token_id] #eop = [self.eop_token_id] #mask = [self.mask_token_id] sep = [self.sep_token_id] #token_ids_0 = cls + token_ids_0 + mask + eos if token_ids_1 is None: return cls + token_ids_0 + eos else: return cls + token_ids_0 + sep + token_ids_1 + eos class GLMGPT2Tokenizer(GPT2Tokenizer, GLMTokenizerMixin): model_input_names = ["input_ids", "position_ids", "attention_mask"] truncation_side: str = "left" def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (:obj:`List[int]`, `optional`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
""" assert token_ids_1 is None cls = [self.cls_token_id] eos = [self.eos_token_id] return cls + token_ids_0 + eos class GLMBertTokenizer(BertTokenizer, GLMTokenizerMixin): model_input_names = ["input_ids", "position_ids", "attention_mask"] truncation_side: str = "left" @property def gmask_token_id(self) -> int: raise NotImplementedError("The model doesn't support gMASK") @property def smask_token_id(self) -> int: raise NotImplementedError("The model doesn't support sMASK") @property def mask_token_ids(self): return [self.mask_token_id] class GLMTokenizer: @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) config_tokenizer_class = tokenizer_config.get("tokenizer_class") if config_tokenizer_class == "GLMRobertaTokenizer": tokenizer_class = GLMRobertaTokenizer elif config_tokenizer_class == "GLMChineseTokenizer": tokenizer_class = GLMChineseTokenizer elif config_tokenizer_class == "GLMGPT2Tokenizer": tokenizer_class = GLMGPT2Tokenizer elif config_tokenizer_class == "GLMBertTokenizer": tokenizer_class = GLMBertTokenizer else: raise NotImplementedError("Not implemented tokenizer type:", config_tokenizer_class) return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) ================================================ FILE: src/resources/models/glm-350M-chinese/tokenizer_config.json ================================================ { "name_or_path": "BAAI/glm-large-chinese", "eos_token": "<|endoftext|>", "pad_token": "<|endoftext|>", "cls_token": "[CLS]", "mask_token": "[MASK]", "unk_token": "[UNK]", "sep_token": "[SEP]", "additional_special_tokens": ["<|startofpiece|>", "<|endofpiece|>", "[gMASK]", "[sMASK]"], "add_prefix_space": false, "tokenizer_class": "GLMChineseTokenizer", "use_fast": false, "auto_map": { "AutoTokenizer": [ "tokenization_glm.GLMChineseTokenizer", null ] } } 
================================================
FILE: src/resources/models/pangu-13B/config.json
================================================
{
  "model_name_or_path": "sunzeyeah/pangu-13B",
  "activation_function": "gelu",
  "architectures": [
    "GPTPanguForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 1024,
  "model_type": "pangu",
  "num_heads": 40,
  "num_layers": 40,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "use_cache": true,
  "vocab_size": 40064,
  "tokenizer_class": "GPTPanguTokenizer",
  "auto_map": {
    "AutoConfig": "configuration_gptpangu.GPTPanguConfig",
    "AutoTokenizer": ["tokenization_gptpangu.GPTPanguTokenizer", null],
    "AutoModelForCausalLM": "modeling_gptpangu.GPTPanguForCausalLM"
  },
  "pad_token_id": 6
}


================================================
FILE: src/resources/models/pangu-13B/configuration_gptpangu.py
================================================
from transformers.configuration_utils import PretrainedConfig


class GPTPanguConfig(PretrainedConfig):
    """Configuration for the PanGu-alpha (13B) causal LM.

    Defaults mirror the 13B checkpoint (hidden 5120, 40 layers, 40 heads);
    `intermediate_size=None` means the MLP uses 4 * hidden_size.
    """

    model_type = "gpt_pangu"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=40000,
        max_position_embeddings=1024,
        hidden_size=5120,
        intermediate_size=None,
        num_layers=40,
        num_heads=40,
        activation_function="gelu",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        scale_attn_weights=True,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        use_cache=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.scale_attn_weights = scale_attn_weights
        self.initializer_range = initializer_range
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.use_cache = use_cache

        super().__init__(**kwargs)


================================================
FILE: src/resources/models/pangu-13B/modeling_gptpangu.py
================================================
"""PyTorch PanguAlpha GPT2 Model"""
# from .configuration_gptpangu import GPTPanguConfig

from typing import Tuple

import math
import torch
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.utils import logging


logger = logging.get_logger(__name__)


class GPTPanguAttention(nn.Module):
    """GPT-2 style causal self-attention with an optional `custom_query`
    (used by the model's last "top query" layer)."""

    def __init__(self, config):
        super().__init__()

        max_positions = config.max_position_embeddings
        # Precomputed lower-triangular causal mask over max_positions.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
                1, 1, max_positions, max_positions
            ),
        )
        # Large negative fill value instead of -inf (fp16-safe, same trick as
        # upstream GPT-2).
        self.register_buffer("masked_bias", torch.tensor(-1e4))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Scaled dot-product attention with the precomputed causal mask.
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if self.scale_attn_weights:
            attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)

        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
        attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype))

        if attention_mask is not None:
            # Apply the (additive) attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        custom_query=None,
        use_cache=False,
        output_attentions=False,
    ):
        # `custom_query` (if given) replaces hidden_states as the query input;
        # keys/values always come from hidden_states.
        query = self.q_proj(custom_query) if custom_query is not None else self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if layer_past is not None:
            past_key, past_value = layer_past
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)


class GPTPanguMLP(nn.Module):
    """Two-layer feed-forward block with activation and residual dropout."""

    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * hidden_size
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = nn.Linear(embed_dim, intermediate_size)
        self.c_proj = nn.Linear(intermediate_size, embed_dim)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class GPTPanguBlock(nn.Module):
    """Pre-LayerNorm transformer block: LN -> attention -> residual,
    LN -> MLP -> residual."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size

        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = GPTPanguAttention(config)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = GPTPanguMLP(inner_dim, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        custom_query=None,
        use_cache=False,
        output_attentions=False,
    ):
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            custom_query=custom_query,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions, cross_attentions)


class GPTPanguPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # config_class = GPTPanguConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
        # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
        # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in module.named_parameters():
            if "c_proj" in name and "weight" in name:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_layers)))

    def _set_gradient_checkpointing(self, module, value=False):
        # Toggled by PreTrainedModel.gradient_checkpointing_enable/disable.
        if isinstance(module, GPTPanguModel):
            module.gradient_checkpointing = value


class GPTPanguModel(GPTPanguPreTrainedModel):
    """PanGu-alpha transformer backbone: token + position embeddings, a stack
    of GPTPanguBlocks, and a separate query-position embedding (`wqe`) that
    drives the last layer's "top query" attention."""

    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
        # Query-position embedding, consumed only by the last block as
        # `custom_query` (PanGu-alpha's top query layer).
        self.wqe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([GPTPanguBlock(config) for _ in range(config.num_layers)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            # Positions continue from the cached prefix length.
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # GPT2Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            if attention_mask.dtype == torch.int or attention_mask.dtype == torch.bool:
                attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x num_heads x N x N
        # head_mask has shape n_layer x batch x num_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        # top attention custom query
        last_layer_id = len(self.h) - 1
        query_embeds = self.wqe(position_ids)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            # Final LayerNorm before last query layer
            if i == last_layer_id:
                hidden_states = self.ln_f(hidden_states)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                # Wrap the block so checkpoint() can re-run it with the frozen
                # kwargs; the single positional arg is the kwargs dict below.
                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        assert len(inputs) == 1
                        input_0 = inputs[0]
                        return module(**input_0, use_cache=use_cache, output_attentions=output_attentions)

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    {
                        "hidden_states": hidden_states,
                        "layer_past": None,
                        "attention_mask": attention_mask,
                        "head_mask": head_mask[i],
                        "custom_query": query_embeds if i == last_layer_id else None
                    }
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    # custom query
                    custom_query=query_embeds if i == last_layer_id else None,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class GPTPanguForCausalLM(GPTPanguPreTrainedModel):
    """PanGu-alpha backbone plus an untied language-modeling head."""

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPTPanguModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.int().cumsum(-1).long() - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens.
            # NOTE(review): ignore_index here is pad_token_id, not the usual
            # -100 the docstring above describes — pad-labeled positions are
            # the ones excluded from the loss.
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.config.pad_token_id)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the :obj:`past_key_values` cache if
        :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
        called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
""" return tuple( tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) for layer_past in past ) ================================================ FILE: src/resources/models/pangu-13B/tokenization_gptpangu.py ================================================ import os import torch import sentencepiece import jieba import numpy as np from transformers.tokenization_utils import PreTrainedTokenizer jieba.add_word('') jieba.add_word('') jieba.add_word('') jieba.add_word('') jieba.add_word('') jieba.add_word('') class GPTPanguTokenizer(PreTrainedTokenizer): # Ref: https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha/src/branch/master/tokenization_jieba.py vocab_files_names = { "model_file": "vocab.model" } def __init__( self, model_file, **kwargs ): super().__init__(**kwargs) self.sp = sentencepiece.SentencePieceProcessor() self.sp.Load(model_file=model_file) self.translator = str.maketrans(" \n", "\u2582\u2583") self.vocab_file = model_file # special token ids # self.eos_token_id = self.sp.piece_to_id("") def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: `[CLS] X [SEP]` - pair of sequences: `[CLS] A [SEP] B [SEP]` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
""" if self.bos_token_id is not None: if token_ids_1 is None: return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] bos = [self.bos_token_id] sep = [self.sep_token_id] eos = [self.eos_token_id] return bos + token_ids_0 + sep + token_ids_1 + eos else: if token_ids_1 is None: return token_ids_0 + [self.eos_token_id] sep = [self.sep_token_id] eos = [self.eos_token_id] return token_ids_0 + sep + token_ids_1 + eos def tokenize(self, text, **kwargs): """ Tokenize a string. """ seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)] return seg_list def convert_tokens_to_ids(self, tokens): if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) special_tokens_index = [i for i, token in enumerate(tokens) if token in self.all_special_tokens] ids = [] i = 0 for j in special_tokens_index: new_seg = " ".join(tokens[i:j]) ids.extend(self.sp.encode(new_seg)) ids.append(self._convert_token_to_id(tokens[j])) i = j + 1 new_seg = " ".join(tokens[i:]) ids.extend(self.sp.encode(new_seg)) return ids # new_seg = " ".join(tokens) # return self.sp.encode(new_seg) # # return tokens def _convert_token_to_id(self, token): return self.sp.piece_to_id(token) def _convert_id_to_token(self, index): return self.sp.id_to_piece(index) def convert_ids_to_tokens(self, ids): return self.decode(ids) def decode(self, ids, **kwargs): if isinstance(ids, torch.Tensor) or isinstance(ids, np.ndarray): ids = ids.tolist() if kwargs.get('skip_special_tokens', None) is True: ids = [token_id for token_id in ids if token_id not in self.all_special_ids] text = self.sp.decode(ids) if isinstance(text, list): text = text[0] text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')#.replace('⁇', self.unk_token) return text @property def vocab_size(self) -> int: """ `int`: Size of the base vocabulary (without the added tokens). 
""" return len(self.sp) def save_vocabulary(self, save_directory, filename_prefix=None): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (`str`): The directory in which to save the vocabulary. filename_prefix (`str`, *optional*): An optional prefix to add to the named of the saved files. Returns: `Tuple(str)`: Paths to the files saved. """ if os.path.isdir(save_directory): vocab_file = os.path.join( save_directory, self.vocab_files_names["model_file"] ) else: vocab_file = save_directory with open(self.vocab_file, 'rb') as fin: proto_str = fin.read() with open(vocab_file, "wb") as writer: writer.write(proto_str) return (vocab_file,) ================================================ FILE: src/resources/models/pangu-13B/tokenizer_config.json ================================================ { "eos_token": "", "pad_token": "", "unk_token": "", "sep_token": "", "bos_token": "", "add_prefix_space": false, "tokenizer_class": "GPTPanguTokenizer", "use_fast": false, "auto_map": { "AutoTokenizer": [ "tokenization_gptpangu.GPTPanguTokenizer", null ] } } ================================================ FILE: src/resources/models/pangu-2.6B/config.json ================================================ { "model_name_or_path": "sunzeyeah/pangu-2.6B", "activation_function": "gelu", "architectures": [ "GPTPanguForCausalLM" ], "attn_pdrop": 0.1, "embd_pdrop": 0.1, "hidden_size": 2560, "initializer_range": 0.02, "intermediate_size": null, "layer_norm_epsilon": 1e-05, "max_position_embeddings": 1024, "model_type": "pangu", "num_heads": 32, "num_layers": 32, "resid_pdrop": 0.1, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "torch_dtype": "float32", "transformers_version": "4.13.0", "use_cache": true, "vocab_size": 40000, "tokenizer_class": "GPTPanguTokenizer", "auto_map": { "AutoConfig": "configuration_gptpangu.GPTPanguConfig", 
"AutoTokenizer": ["tokenization_gptpangu.GPTPanguTokenizer", null], "AutoModelForCausalLM": "modeling_gptpangu.GPTPanguForCausalLM" }, "pad_token_id": 6 } ================================================ FILE: src/resources/models/pangu-2.6B/configuration_gptpangu.py ================================================ from transformers.configuration_utils import PretrainedConfig class GPTPanguConfig(PretrainedConfig): model_type = "gpt_pangu" keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, vocab_size=40000, max_position_embeddings=1024, hidden_size=2560, intermediate_size=None, num_layers=32, num_heads=32, activation_function="gelu", resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-5, scale_attn_weights=True, initializer_range=0.02, summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, use_cache=True, **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_layers = num_layers self.num_heads = num_heads self.activation_function = activation_function self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.scale_attn_weights = scale_attn_weights self.initializer_range = initializer_range self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels self.use_cache = use_cache super().__init__(**kwargs) ================================================ FILE: src/resources/models/pangu-2.6B/modeling_gptpangu.py ================================================ """PyTorch PanguAlpha GPT2 Model""" # from .configuration_gptpangu import GPTPanguConfig from typing import Tuple import math import torch 
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.utils import logging


logger = logging.get_logger(__name__)


class GPTPanguAttention(nn.Module):
    """Multi-head causal self-attention used by every PanGu-α block.

    Identical to GPT-2 attention except that the query may come from a
    separate ``custom_query`` tensor: PanGu-α's top "query layer" feeds
    position ("query") embeddings as the query of the last block.
    """

    def __init__(self, config):
        super().__init__()
        max_positions = config.max_position_embeddings
        # Lower-triangular causal mask, shape (1, 1, P, P); sliced per step in _attn().
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
                1, 1, max_positions, max_positions
            ),
        )
        # Large negative fill value for masked positions (-1e4 stays finite in fp16).
        self.register_buffer("masked_bias", torch.tensor(-1e4))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights

        # Separate q/k/v/output projections (GPT-2 uses a fused Conv1D instead).
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # (batch, head, q_len, head_dim) x (batch, head, head_dim, k_len) -> scores
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if self.scale_attn_weights:
            # Scale by sqrt(head_dim), as in "Attention Is All You Need".
            attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)

        # With a cache, key_length > query_length: the new queries may attend to
        # all cached keys plus themselves, hence the (key_length - query_length) offset.
        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
        attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype))

        if attention_mask is not None:
            # Apply the (additive) attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        custom_query=None,
        use_cache=False,
        output_attentions=False,
    ):
        # PanGu-α top query layer: when custom_query is given, the attention
        # query is computed from it instead of from the hidden states.
        query = self.q_proj(custom_query) if custom_query is not None else self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if layer_past is not None:
            # Prepend cached keys/values along the sequence axis.
            past_key, past_value = layer_past
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)
class GPTPanguMLP(nn.Module):
    """Position-wise feed-forward network: Linear -> activation -> Linear -> dropout."""

    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * hidden_size
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = nn.Linear(embed_dim, intermediate_size)
        self.c_proj = nn.Linear(intermediate_size, embed_dim)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class GPTPanguBlock(nn.Module):
    """One pre-LayerNorm transformer block: LN -> attention -> residual, then LN -> MLP -> residual."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        # intermediate_size of None falls back to the GPT-2 convention of 4x hidden.
        inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size

        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = GPTPanguAttention(config)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = GPTPanguMLP(inner_dim, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        custom_query=None,
        use_cache=False,
        output_attentions=False,
    ):
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            custom_query=custom_query,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            # Drop the (None) present entry when no cache is requested.
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions, cross_attentions)


class
GPTPanguPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # config_class = GPTPanguConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
        # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
        # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
        # > -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in module.named_parameters():
            if "c_proj" in name and "weight" in name:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_layers)))

    def _set_gradient_checkpointing(self, module, value=False):
        # Gradient checkpointing is toggled on the backbone module only.
        if isinstance(module, GPTPanguModel):
            module.gradient_checkpointing = value


class GPTPanguModel(GPTPanguPreTrainedModel):
    """PanGu-α backbone: token/position embeddings plus a stack of GPTPanguBlock layers.

    Unlike GPT-2 it also owns a "query" embedding table (``wqe``) whose output
    is fed as the attention query of the last (top) layer.
    """

    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        # wte: token embeddings; wpe: position embeddings;
        # wqe: position-indexed "query" embeddings consumed by the top layer only.
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
        self.wqe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([GPTPanguBlock(config) for _ in range(config.num_layers)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # Resolve per-call flags against config defaults.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and
inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            # Cached key tensors are (batch, head, past_seq, head_dim).
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            # During incremental decoding positions continue from the cached length.
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # GPT2Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            # NOTE(review): only torch.int and torch.bool masks are converted here;
            # a torch.long mask (the common tokenizer output) skips the cast and is
            # promoted to float32 by the arithmetic below — confirm this is intended.
            if attention_mask.dtype == torch.int or attention_mask.dtype == torch.bool:
                attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x num_heads x N x N
        # head_mask has shape n_layer x batch x num_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            # Token-type embeddings reuse the token embedding table (no separate table).
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        # top attention custom query
        last_layer_id = len(self.h) - 1
        query_embeds = self.wqe(position_ids)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            # Final LayerNorm before last query layer
            if i == last_layer_id:
                hidden_states = self.ln_f(hidden_states)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    # Wrap the block so checkpoint() can re-run it from a single
                    # dict argument during the backward pass.
                    def custom_forward(*inputs):
                        assert len(inputs) == 1
                        input_0 = inputs[0]
                        return module(**input_0, use_cache=use_cache, output_attentions=output_attentions)

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    {
                        "hidden_states": hidden_states,
                        "layer_past": None,
                        "attention_mask": attention_mask,
                        "head_mask": head_mask[i],
                        "custom_query": query_embeds if i == last_layer_id else None
                    }
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    # custom query
                    custom_query=query_embeds if i == last_layer_id else None,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                # Attention weights are at index 2 when a present entry precedes them.
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class GPTPanguForCausalLM(GPTPanguPreTrainedModel):
    """PanGu-α backbone with a linear LM head for causal language modeling."""

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPTPanguModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            input_ids =
input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.int().cumsum(-1).long() - 1
            # Padding positions get a dummy position id of 1; they are masked anyway.
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens. NOTE: padding labels are excluded via
            # ignore_index=pad_token_id (in addition to the usual -100).
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.config.pad_token_id)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the :obj:`past_key_values` cache if
        :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
        called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past
        )


================================================
FILE: src/resources/models/pangu-2.6B/tokenization_gptpangu.py
================================================
import os
import torch
import sentencepiece
import jieba
import numpy as np

from transformers.tokenization_utils import PreTrainedTokenizer

# NOTE(review): the words registered below are empty strings in this copy of
# the file — the special-token literals appear to have been lost in extraction.
# TODO: confirm against the upstream tokenization_gptpangu.py.
jieba.add_word('')
jieba.add_word('')
jieba.add_word('')
jieba.add_word('')
jieba.add_word('')
jieba.add_word('')


class GPTPanguTokenizer(PreTrainedTokenizer):
    """SentencePiece + jieba tokenizer for PanGu-α.

    Text is first segmented with jieba; spaces and newlines are mapped to the
    placeholder characters \u2582/\u2583, and the resulting pieces are encoded
    with the SentencePiece model shipped as ``vocab.model``.
    """

    # Ref: https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha/src/branch/master/tokenization_jieba.py
    vocab_files_names = {
        "model_file": "vocab.model"
    }

    def __init__(
        self,
        model_file,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.Load(model_file=model_file)

        # Map " " -> \u2582 and "\n" -> \u2583 before SentencePiece encoding;
        # decode() reverses this mapping.
        self.translator = str.maketrans(" \n", "\u2582\u2583")

        self.vocab_file = model_file

        # special token ids
        # self.eos_token_id = self.sp.piece_to_id("")

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if self.bos_token_id is not None:
            if token_ids_1 is None:
                return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
            bos = [self.bos_token_id]
            sep = [self.sep_token_id]
            eos = [self.eos_token_id]
            return bos + token_ids_0 + sep + token_ids_1 + eos
        else:
            # No BOS configured: only terminate (and, for pairs, separate) sequences.
            if token_ids_1 is None:
                return token_ids_0 + [self.eos_token_id]
            sep = [self.sep_token_id]
            eos = [self.eos_token_id]
            return token_ids_0 + sep + token_ids_1 + eos

    def tokenize(self, text, **kwargs):
        """ Tokenize a string. """
        seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
        return seg_list

    def convert_tokens_to_ids(self, tokens):
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        # Encode the runs between special tokens with SentencePiece and map each
        # special token individually, preserving the original ordering.
        special_tokens_index = [i for i, token in enumerate(tokens) if token in self.all_special_tokens]

        ids = []
        i = 0
        for j in special_tokens_index:
            new_seg = " ".join(tokens[i:j])
            ids.extend(self.sp.encode(new_seg))
            ids.append(self._convert_token_to_id(tokens[j]))
            i = j + 1

        new_seg = " ".join(tokens[i:])
        ids.extend(self.sp.encode(new_seg))

        return ids

        # new_seg = " ".join(tokens)
        # return self.sp.encode(new_seg)
        # # return tokens

    def _convert_token_to_id(self, token):
        return self.sp.piece_to_id(token)

    def _convert_id_to_token(self, index):
        return self.sp.id_to_piece(index)

    def convert_ids_to_tokens(self, ids):
        # NOTE(review): returns a decoded string rather than a list of token
        # strings, diverging from the base-class contract — confirm callers rely on this.
        return self.decode(ids)

    def decode(self, ids, **kwargs):
        if isinstance(ids, torch.Tensor) or isinstance(ids, np.ndarray):
            ids = ids.tolist()

        if kwargs.get('skip_special_tokens', None) is True:
            ids = [token_id for token_id in ids if token_id not in self.all_special_ids]
        text = self.sp.decode(ids)
        if isinstance(text, list):
            text = text[0]
        # Undo the space/newline placeholder mapping introduced in tokenize().
        text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')#.replace('⁇', self.unk_token)

        return text

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return len(self.sp)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the names of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["model_file"]
            )
        else:
            # A file path was given directly; write the model there.
            vocab_file = save_directory

        # Copy the raw SentencePiece model bytes to the target location.
        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)


================================================
FILE: src/resources/models/pangu-2.6B/tokenizer_config.json
================================================
{
  "eos_token": "",
  "pad_token": "",
  "unk_token": "",
  "sep_token": "",
  "bos_token": "",
  "add_prefix_space": false,
  "tokenizer_class": "GPTPanguTokenizer",
  "use_fast": false,
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_gptpangu.GPTPanguTokenizer",
      null
    ]
  }
}


================================================
FILE: src/resources/models/pangu-350M/README-Reward.md
================================================
Link to github: [here](https://github.com/sunzeyeah/RLHF)

---

# Model Description

Pangu-α is proposed by a joint technical team headed by PCNL. It was first released in [this repository](https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha)

It is the first large-scale Chinese pre-trained language model with 200 billion parameters trained on 2048 Ascend processors using an automatic hybrid parallel training strategy. The whole training process is done on the “Peng Cheng Cloud Brain II” computing platform with the domestic deep learning framework called MindSpore.
The PengCheng·PanGu-α pre-training model can support rich applications, has strong few-shot learning capabilities, and has outstanding performance in text generation tasks such as knowledge question answering, knowledge retrieval, knowledge reasoning, and reading comprehension.

This repository contains a PyTorch implementation of the PanGu model with 350 million parameters and pretrained weights (FP32 precision). It uses the supervised-finetuned [pangu-350M-sft](https://huggingface.co/sunzeyeah/pangu-350M-sft) and performs **reward training** on [Chinese Chatgpt Corpus](https://huggingface.co/datasets/sunzeyeah/chinese_chatgpt_corpus).


================================================
FILE: src/resources/models/pangu-350M/README-SFT.md
================================================
Link to github: [here](https://github.com/sunzeyeah/RLHF)

---

# Model Description

Pangu-α is proposed by a joint technical team headed by PCNL. It was first released in [this repository](https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha)

It is the first large-scale Chinese pre-trained language model with 200 billion parameters trained on 2048 Ascend processors using an automatic hybrid parallel training strategy. The whole training process is done on the “Peng Cheng Cloud Brain II” computing platform with the domestic deep learning framework called MindSpore.

The PengCheng·PanGu-α pre-training model can support rich applications, has strong few-shot learning capabilities, and has outstanding performance in text generation tasks such as knowledge question answering, knowledge retrieval, knowledge reasoning, and reading comprehension.

This repository contains a PyTorch implementation of the PanGu model with 350 million parameters and pretrained weights (FP32 precision). It uses the pretrained pangu-350M model and performs **supervised finetuning (SFT)** on [Chinese Chatgpt Corpus](https://huggingface.co/datasets/sunzeyeah/chinese_chatgpt_corpus).
--- # Usage (Text Generation) Currently PanGu model is not supported by transformers, so `trust_remote_code=True` is required to load model implementation in this repo. ```python from transformers import TextGenerationPipeline, AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/pangu-350M-sft", trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained("sunzeyeah/pangu-350M-sft", trust_remote_code=True) prompt = "我不能确定对方是不是喜欢我,我却想分分秒秒跟他在一起,有谁能告诉我如何能想他少一点回答:" inputs = tokenizer(prompt, add_special_tokens=False, return_token_type_ids=False, return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.pad_token_id, do_sample=False, num_return_sequences=1, top_p=0.8, temperature=0.8) results = tokenizer.batch_decode(outputs, skip_special_tokens=True) results = [result.split("答:", maxsplit=1)[1] for result in results] print(results) ``` Expected output: ```python ["你爱他就多关心他,少在他面前表现出你的脆弱。这样他才更会爱你。"] ``` ================================================ FILE: src/resources/models/pangu-350M/README.md ================================================ --- language: - zh tags: - pangu - chatgpt --- Link to github: [here](https://github.com/sunzeyeah/RLHF) --- # Model Description Pangu-α is proposed by a joint technical team headed by PCNL. It was first released in [this repository](https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha) It is the first large-scale Chinese pre-trained language model with 200 billion parameters trained on 2048 Ascend processors using an automatic hybrid parallel training strategy. The whole training process is done on the “Peng Cheng Cloud Brain II” computing platform with the domestic deep learning framework called MindSpore. 
The PengCheng·PanGu-α pre-training model can support rich applications, has strong few-shot learning capabilities, and has outstanding performance in text generation tasks such as knowledge question and answer, knowledge retrieval, knowledge reasoning, and reading comprehension.

This repository contains PyTorch implementation of PanGu model with 350 million parameters pretrained weights (FP32 precision). It is slightly different from the [original pangu implementation](https://huggingface.co/imone/pangu_2_6B) to support the ChatGPT training pipeline in this github repo: [sunzeyeah/RLHF](https://github.com/sunzeyeah/RLHF).

---


================================================
FILE: src/resources/models/pangu-350M/config.json
================================================
{
  "model_name_or_path": "sunzeyeah/pangu-350M",
  "activation_function": "gelu",
  "architectures": [
    "GPTPanguForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 1024,
  "model_type": "pangu",
  "num_heads": 16,
  "num_layers": 24,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "vocab_size": 40000,
  "tokenizer_class": "GPTPanguTokenizer",
  "auto_map": {
    "AutoConfig": "configuration_gptpangu.GPTPanguConfig",
    "AutoTokenizer": ["tokenization_gptpangu.GPTPanguTokenizer", null],
    "AutoModelForCausalLM": "modeling_gptpangu.GPTPanguForCausalLM"
  },
  "pad_token_id": 6
}


================================================
FILE: src/resources/models/pangu-350M/configuration_gptpangu.py
================================================
from transformers.configuration_utils import PretrainedConfig


class GPTPanguConfig(PretrainedConfig):
    """Configuration class for the PanGu-α 350M causal language model.

    Defaults correspond to the 350M checkpoint: 24 layers, 16 attention
    heads, hidden size 1024, 40k-piece vocabulary. Remaining keyword
    arguments (pad/bos/eos ids, etc.) are forwarded to ``PretrainedConfig``.
    """

    model_type = "gpt_pangu"
    # past_key_values is a decoding cache, not a model output to compare at inference
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=40000,
        max_position_embeddings=1024,
        hidden_size=1024,
        intermediate_size=None,  # None -> 4 * hidden_size inside the MLP block
        num_layers=24,
        num_heads=16,
        activation_function="gelu",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        scale_attn_weights=True,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        use_cache=True,
        # bos_token_id=9,
        # eos_token_id=9,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.scale_attn_weights = scale_attn_weights
        self.initializer_range = initializer_range
        # summary_* attributes are kept for API compatibility with GPT-2-style
        # sequence-summary heads; the PanGu modeling code itself does not read them.
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.use_cache = use_cache
        # self.bos_token_id = bos_token_id
        # self.eos_token_id = eos_token_id

        super().__init__(**kwargs)


================================================
FILE: src/resources/models/pangu-350M/modeling_gptpangu.py
================================================
"""PyTorch PanguAlpha GPT2 Model"""

# from .configuration_gptpangu import GPTPanguConfig
from typing import Tuple

import math
import torch
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.utils import logging


logger = logging.get_logger(__name__)


class GPTPanguAttention(nn.Module):
    """Multi-head causal self-attention (same implementation as the 2.6B copy).

    Supports an optional ``custom_query`` input used by PanGu-α's top
    "query layer", where position embeddings provide the attention query.
    """

    def __init__(self, config):
        super().__init__()
        max_positions =
config.max_position_embeddings self.register_buffer( "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( 1, 1, max_positions, max_positions ), ) self.register_buffer("masked_bias", torch.tensor(-1e4)) self.embed_dim = config.hidden_size self.num_heads = config.num_heads self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." ) self.scale_attn_weights = config.scale_attn_weights self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True) self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) def _attn(self, query, key, value, attention_mask=None, head_mask=None): attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: attn_weights = attn_weights / (float(value.size(-1)) ** 0.5) query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype)) if attention_mask is not None: # Apply the attention mask attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise attn_weights = attn_weights.type(value.dtype) attn_weights = self.attn_dropout(attn_weights) # Mask heads if we want to if head_mask is not None: attn_weights = attn_weights * head_mask attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights def _split_heads(self, tensor, num_heads, 
attn_head_size): """ Splits hidden_size dim into attn_head_size and num_heads """ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) tensor = tensor.view(*new_shape) return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) def _merge_heads(self, tensor, num_heads, attn_head_size): """ Merges attn_head_size dim and num_attn_heads dim into hidden_size """ tensor = tensor.permute(0, 2, 1, 3).contiguous() new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) return tensor.view(new_shape) def forward( self, hidden_states, layer_past=None, attention_mask=None, head_mask=None, custom_query=None, use_cache=False, output_attentions=False, ): query = self.q_proj(custom_query) if custom_query is not None else self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) query = self._split_heads(query, self.num_heads, self.head_dim) key = self._split_heads(key, self.num_heads, self.head_dim) value = self._split_heads(value, self.num_heads, self.head_dim) if layer_past is not None: past_key, past_value = layer_past key = torch.cat((past_key, key), dim=-2) value = torch.cat((past_value, value), dim=-2) if use_cache is True: present = (key, value) else: present = None attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) attn_output = self.c_proj(attn_output) attn_output = self.resid_dropout(attn_output) outputs = (attn_output, present) if output_attentions: outputs += (attn_weights,) return outputs # a, present, (attentions) class GPTPanguMLP(nn.Module): def __init__(self, intermediate_size, config): # in MLP: intermediate_size= 4 * hidden_size super().__init__() embed_dim = config.hidden_size self.c_fc = nn.Linear(embed_dim, intermediate_size) self.c_proj = nn.Linear(intermediate_size, embed_dim) self.act = ACT2FN[config.activation_function] self.dropout = nn.Dropout(config.resid_pdrop) def 
class GPTPanguBlock(nn.Module):
    """One PanGu transformer layer: pre-LayerNorm self-attention followed by a
    pre-LayerNorm MLP, each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        mlp_width = 4 * width if config.intermediate_size is None else config.intermediate_size

        self.ln_1 = nn.LayerNorm(width, eps=config.layer_norm_epsilon)
        self.attn = GPTPanguAttention(config)
        self.ln_2 = nn.LayerNorm(width, eps=config.layer_norm_epsilon)
        self.mlp = GPTPanguMLP(mlp_width, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        custom_query=None,
        use_cache=False,
        output_attentions=False,
    ):
        """Returns (hidden_states, present, (attentions)) when use_cache else
        (hidden_states, (attentions))."""
        attn_result = self.attn(
            self.ln_1(hidden_states),
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            custom_query=custom_query,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # Residual connection around the attention sub-layer.
        hidden_states = attn_result[0] + hidden_states

        # Residual connection around the feed-forward sub-layer.
        hidden_states = hidden_states + self.mlp(self.ln_2(hidden_states))

        extras = attn_result[1:]  # present, (attentions)
        if use_cache:
            return (hidden_states,) + extras
        return (hidden_states,) + extras[1:]
""" # config_class = GPTPanguConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, (nn.Linear,)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
class GPTPanguModel(GPTPanguPreTrainedModel):
    """Bare PanGu-Alpha transformer.

    Token (wte) and position (wpe) embeddings feed a stack of GPTPanguBlock
    layers with a final LayerNorm.  PanGu additionally learns a separate query
    embedding table (wqe) whose rows are used as a custom attention query for
    the last layer ("top attention"); note ln_f is applied *before* that last
    layer rather than after the loop.
    """

    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)  # token embeddings
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)  # position embeddings
        self.wqe = nn.Embedding(config.max_position_embeddings, self.embed_dim)  # top-layer query embeddings
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([GPTPanguBlock(config) for _ in range(config.num_layers)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Standard HF-style forward; returns BaseModelOutputWithPast (or a
        tuple when return_dict is False)."""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            # past shape: (key, value) per layer, key/value (batch, head, seq, head_dim)
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # GPT2Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            # NOTE(review): masks of dtype torch.long are NOT cast here (only
            # torch.int / torch.bool are) — presumably callers pass int32/bool or
            # float masks; verify against callers before relying on this.
            if attention_mask.dtype == torch.int or attention_mask.dtype == torch.bool:
                attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x num_heads x N x N
        # head_mask has shape n_layer x batch x num_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        # top attention custom query
        last_layer_id = len(self.h) - 1
        query_embeds = self.wqe(position_ids)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            # Final LayerNorm before last query layer
            if i == last_layer_id:
                hidden_states = self.ln_f(hidden_states)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    # Wrap the block so checkpoint() can re-run it from a single
                    # dict argument during recomputation.
                    def custom_forward(*inputs):
                        assert len(inputs) == 1
                        input_0 = inputs[0]
                        return module(**input_0, use_cache=use_cache, output_attentions=output_attentions)

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    {
                        "hidden_states": hidden_states,
                        # NOTE(review): layer_past is dropped under checkpointing
                        # (cache is incompatible with recomputation anyway).
                        "layer_past": None,
                        "attention_mask": attention_mask,
                        "head_mask": head_mask[i],
                        "custom_query": query_embeds if i == last_layer_id else None
                    }
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    # custom query
                    custom_query=query_embeds if i == last_layer_id else None,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                # Attention weights index shifts by one when no present is emitted.
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
class GPTPanguForCausalLM(GPTPanguPreTrainedModel):
    """PanGu transformer with a linear LM head for causal language modeling."""

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPTPanguModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        """Trim inputs to the last token once a KV cache exists; build
        position_ids from the attention mask for batched generation."""
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.int().cumsum(-1).long() - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens.
            # NOTE(review): padding is ignored via ignore_index=pad_token_id in
            # addition to the conventional -100 — confirm labels are padded with
            # pad_token_id by the data pipeline.
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.config.pad_token_id)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the :obj:`past_key_values` cache if
        :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
        called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past
        )
""" return tuple( tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) for layer_past in past ) ================================================ FILE: src/resources/models/pangu-350M/tokenization_gptpangu.py ================================================ import os import torch import sentencepiece import jieba import numpy as np from transformers.tokenization_utils import PreTrainedTokenizer jieba.add_word('') jieba.add_word('') jieba.add_word('') jieba.add_word('') jieba.add_word('') jieba.add_word('') class GPTPanguTokenizer(PreTrainedTokenizer): # Ref: https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha/src/branch/master/tokenization_jieba.py vocab_files_names = { "model_file": "vocab.model" } def __init__( self, model_file, **kwargs ): super().__init__(**kwargs) self.sp = sentencepiece.SentencePieceProcessor() self.sp.Load(model_file=model_file) self.translator = str.maketrans(" \n", "\u2582\u2583") self.vocab_file = model_file # special token ids # self.eos_token_id = self.sp.piece_to_id("") def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: `[CLS] X [SEP]` - pair of sequences: `[CLS] A [SEP] B [SEP]` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
""" if self.bos_token_id is not None: if token_ids_1 is None: return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] bos = [self.bos_token_id] sep = [self.sep_token_id] eos = [self.eos_token_id] return bos + token_ids_0 + sep + token_ids_1 + eos else: if token_ids_1 is None: return token_ids_0 + [self.eos_token_id] sep = [self.sep_token_id] eos = [self.eos_token_id] return token_ids_0 + sep + token_ids_1 + eos def tokenize(self, text, **kwargs): """ Tokenize a string. """ seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)] return seg_list def convert_tokens_to_ids(self, tokens): if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) special_tokens_index = [i for i, token in enumerate(tokens) if token in self.all_special_tokens] ids = [] i = 0 for j in special_tokens_index: new_seg = " ".join(tokens[i:j]) ids.extend(self.sp.encode(new_seg)) ids.append(self._convert_token_to_id(tokens[j])) i = j + 1 new_seg = " ".join(tokens[i:]) ids.extend(self.sp.encode(new_seg)) return ids # new_seg = " ".join(tokens) # return self.sp.encode(new_seg) # # return tokens def _convert_token_to_id(self, token): return self.sp.piece_to_id(token) def _convert_id_to_token(self, index): return self.sp.id_to_piece(index) def convert_ids_to_tokens(self, ids): return self.decode(ids) def decode(self, ids, **kwargs): if isinstance(ids, torch.Tensor) or isinstance(ids, np.ndarray): ids = ids.tolist() if kwargs.get('skip_special_tokens', None) is True: ids = [token_id for token_id in ids if token_id not in self.all_special_ids] text = self.sp.decode(ids) if isinstance(text, list): text = text[0] text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')#.replace('⁇', self.unk_token) return text @property def vocab_size(self) -> int: """ `int`: Size of the base vocabulary (without the added tokens). 
""" return len(self.sp) def save_vocabulary(self, save_directory, filename_prefix=None): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (`str`): The directory in which to save the vocabulary. filename_prefix (`str`, *optional*): An optional prefix to add to the named of the saved files. Returns: `Tuple(str)`: Paths to the files saved. """ if os.path.isdir(save_directory): vocab_file = os.path.join( save_directory, self.vocab_files_names["model_file"] ) else: vocab_file = save_directory with open(self.vocab_file, 'rb') as fin: proto_str = fin.read() with open(vocab_file, "wb") as writer: writer.write(proto_str) return (vocab_file,) ================================================ FILE: src/resources/models/pangu-350M/tokenizer_config.json ================================================ { "eos_token": "", "pad_token": "", "unk_token": "", "sep_token": "", "bos_token": "", "add_prefix_space": false, "tokenizer_class": "GPTPanguTokenizer", "use_fast": false, "auto_map": { "AutoTokenizer": [ "tokenization_gptpangu.GPTPanguTokenizer", null ] } } ================================================ FILE: src/resources/models/qwen-7B-chat/config.json ================================================ { "activation": "swiglu", "apply_residual_connection_post_layernorm": false, "architectures": [ "QWenLMHeadModel" ], "auto_map": { "AutoConfig": "configuration_qwen.QWenConfig", "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel" }, "attn_pdrop": 0.0, "bf16": false, "bias_dropout_fusion": true, "bos_token_id": 151643, "embd_pdrop": 0.1, "eos_token_id": 151643, "ffn_hidden_size": 22016, "fp16": false, "initializer_range": 0.02, "kv_channels": 128, "layer_norm_epsilon": 1e-05, "model_type": "qwen", "n_embd": 4096, "n_head": 32, "n_layer": 32, "n_positions": 6144, "no_bias": true, "onnx_safe": null, "padded_vocab_size": 151936, "params_dtype": "torch.bfloat16", "pos_emb": "rotary", "resid_pdrop": 0.1, "rotary_emb_base": 10000, "rotary_pct": 
class QWenConfig(PretrainedConfig):
    """Configuration for QWen models; attribute_map exposes the HF-standard
    names (hidden_size, num_attention_heads, ...) over the GPT-2-style ones."""

    model_type = "qwen"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "max_position_embeddings": "n_positions",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=151851,
        n_embd=4096,
        n_layer=32,
        n_head=32,
        n_inner=None,
        embd_pdrop=0.0,
        attn_pdrop=0.0,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        eos_token_id=151643,
        apply_residual_connection_post_layernorm=False,
        bf16=True,
        kv_channels=128,
        rotary_pct=1.0,
        rotary_emb_base=10000,
        use_dynamic_ntk=False,
        use_logn_attn=False,
        use_flash_attn=True,
        ffn_hidden_size=22016,
        no_bias=True,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.eos_token_id = eos_token_id
        super().__init__(
            eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
        )

        # NOTE(review): default vocab_size (151851) differs from the shipped
        # config.json (151936 padded vocab) — the json value wins when loading.
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm
        )
        self.bf16 = bf16
        self.kv_channels = kv_channels
        self.rotary_pct = rotary_pct
        self.rotary_emb_base = rotary_emb_base
        self.use_dynamic_ntk = use_dynamic_ntk
        self.use_logn_attn = use_logn_attn
        self.use_flash_attn = use_flash_attn
        self.ffn_hidden_size = ffn_hidden_size
        self.no_bias = no_bias
        self.tie_word_embeddings = tie_word_embeddings
import importlib import math from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List import torch import torch.nn.functional as F import torch.utils.checkpoint from torch.cuda.amp import autocast from torch.nn import CrossEntropyLoss from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList from transformers.generation.logits_process import LogitsProcessorList if TYPE_CHECKING: from transformers.generation.streamers import BaseStreamer from transformers.generation.utils import GenerateOutput from transformers.modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import logging try: from einops import rearrange except ImportError: rearrange = None from torch import nn try: from flash_attn.layers.rotary import apply_rotary_emb_func from einops import rearrange use_flash_rotary = True except ImportError: use_flash_rotary = False print("Warning: import flash_attn rotary fail, please install FlashAttention rotary to get better performance " "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary") try: from flash_attn.ops.rms_norm import rms_norm except ImportError: rms_norm = None print("Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get better performance " "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm") from .configuration_qwen import QWenConfig from .qwen_generation_utils import ( HistoryType, make_context, decode_tokens, get_stop_words_ids, StopWordsLogitsProcessor, ) logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "qwen" _CONFIG_FOR_DOC = "QWenConfig" QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"] try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func except ImportError: flash_attn_unpadded_func = None print("Warning: import flash_attn fail, please install FlashAttention " 
"https://github.com/Dao-AILab/flash-attention") class FlashSelfAttention(torch.nn.Module): def __init__( self, causal=False, softmax_scale=None, attention_dropout=0.0, ): super().__init__() assert flash_attn_unpadded_func is not None, ( "Please install FlashAttention first, " "e.g., with pip install flash-attn" ) assert ( rearrange is not None ), "Please install einops first, e.g., with pip install einops" self.causal = causal self.softmax_scale = softmax_scale self.dropout_p = attention_dropout def forward(self, q, k, v): assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v))) assert all((i.is_cuda for i in (q, k, v))) batch_size, seqlen_q = q.shape[0], q.shape[1] seqlen_k = k.shape[1] q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] cu_seqlens_q = torch.arange( 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q.device, ) if self.training: assert seqlen_k == seqlen_q is_causal = self.causal cu_seqlens_k = cu_seqlens_q else: is_causal = seqlen_q == seqlen_k cu_seqlens_k = torch.arange( 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=q.device, ) self.dropout_p = 0 output = flash_attn_unpadded_func( q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, self.dropout_p, softmax_scale=self.softmax_scale, causal=is_causal, ) output = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) return output class QWenAttention(nn.Module): def __init__(self, config, layer_number=None): super().__init__() max_positions = config.max_position_embeddings self.register_buffer( "bias", torch.tril( torch.ones((max_positions, max_positions), dtype=torch.bool) ).view(1, 1, max_positions, max_positions), persistent=False, ) self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) self.layer_number = max(1, layer_number) self.params_dtype = config.params_dtype self.seq_length = config.seq_length self.hidden_size = config.hidden_size self.split_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads self.use_flash_attn = config.use_flash_attn self.scale_attn_weights = True self.layer_idx = None self.projection_size = config.kv_channels * config.num_attention_heads assert self.projection_size % config.num_attention_heads == 0 self.hidden_size_per_attention_head = ( self.projection_size // config.num_attention_heads ) self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size) self.c_proj = nn.Linear( config.hidden_size, self.projection_size, bias=not config.no_bias ) self.is_fp32 = not(config.bf16 or config.fp16) if self.use_flash_attn and flash_attn_unpadded_func is not None and not self.is_fp32: self.core_attention_flash = FlashSelfAttention( causal=True, attention_dropout=config.attn_pdrop ) self.bf16 = config.bf16 if config.rotary_pct == 1.0: self.rotary_ndims = None else: assert config.rotary_pct < 1 self.rotary_ndims = int( self.hidden_size_per_attention_head * config.rotary_pct ) dim = ( self.rotary_ndims if self.rotary_ndims is not None else self.hidden_size_per_attention_head ) self.rotary_emb = RotaryEmbedding( dim, base=config.rotary_emb_base ) self.use_dynamic_ntk = config.use_dynamic_ntk self.use_logn_attn = config.use_logn_attn logn_list = [math.log(i, self.seq_length) if i > self.seq_length else 1 for i in range(1, 32768)] 
self.logn_tensor = torch.Tensor(logn_list)[None, :, None, None] self._ntk_cached = 1.0 self.attn_dropout = nn.Dropout(config.attn_pdrop) def _attn(self, query, key, value, attention_mask=None, head_mask=None): attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: attn_weights = attn_weights / torch.full( [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device, ) query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.bias[ :, :, key_length - query_length : key_length, :key_length ] mask_value = torch.finfo(attn_weights.dtype).min mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to( attn_weights.device ) attn_weights = torch.where( causal_mask, attn_weights.to(attn_weights.dtype), mask_value ) attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = attn_weights.type(value.dtype) attn_weights = self.attn_dropout(attn_weights) if head_mask is not None: attn_weights = attn_weights * head_mask attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2) return attn_output, attn_weights def _upcast_and_reordered_attn( self, query, key, value, attention_mask=None, head_mask=None ): bsz, num_heads, q_seq_len, dk = query.size() _, _, k_seq_len, _ = key.size() attn_weights = torch.empty( bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device, ) scale_factor = 1.0 if self.scale_attn_weights: scale_factor /= float(value.size(-1)) ** 0.5 with autocast(enabled=False): q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape( -1, dk, k_seq_len ) attn_weights = torch.baddbmm( attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor ) attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.bias[ :, :, key_length - query_length : key_length, :key_length ] mask_value = torch.finfo(attn_weights.dtype).min 
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to( attn_weights.device ) attn_weights = torch.where(causal_mask, attn_weights, mask_value) if attention_mask is not None: attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) if attn_weights.dtype != torch.float32: raise RuntimeError( "Error with upcasting, attn_weights does not have dtype torch.float32" ) attn_weights = attn_weights.type(value.dtype) attn_weights = self.attn_dropout(attn_weights) if head_mask is not None: attn_weights = attn_weights * head_mask attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights def _split_heads(self, tensor, num_heads, attn_head_size): new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) tensor = tensor.view(new_shape) return tensor def _merge_heads(self, tensor, num_heads, attn_head_size): tensor = tensor.contiguous() new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) return tensor.view(new_shape) def forward( self, hidden_states: Optional[Tuple[torch.FloatTensor]], layer_past: Optional[Tuple[torch.Tensor]] = None, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ): mixed_x_layer = self.c_attn(hidden_states) query, key, value = mixed_x_layer.split(self.split_size, dim=2) query = self._split_heads(query, self.num_heads, self.head_dim) key = self._split_heads(key, self.num_heads, self.head_dim) value = self._split_heads(value, self.num_heads, self.head_dim) kv_seq_len = hidden_states.size()[1] if layer_past: # layer past[0] shape: bs * seq_len * head_num * dim kv_seq_len += layer_past[0].shape[1] if self.use_dynamic_ntk and kv_seq_len == hidden_states.size()[1] and not self.training: context_value = math.log(kv_seq_len / 
self.seq_length, 2) + 1
            ntk_alpha = 2 ** math.ceil(context_value) - 1
            ntk_alpha = max(ntk_alpha, 1)
            self._ntk_cached = ntk_alpha
        else:
            ntk_alpha = self._ntk_cached
        rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
            hidden_states.device
        )

        if rotary_pos_emb is not None:
            # Normalize to a (query_emb, key_emb) pair.
            if isinstance(rotary_pos_emb, tuple):
                rotary_pos_emb = rotary_pos_emb
            else:
                rotary_pos_emb = (rotary_pos_emb,) * 2

        if rotary_pos_emb is not None:
            q_pos_emb, k_pos_emb = rotary_pos_emb
            # Slice the pos emb for current inference
            cur_len = query.shape[1]
            q_pos_emb = q_pos_emb[:, -cur_len:, :, :]
            k_pos_emb = k_pos_emb[:, -cur_len:, :, :]
            query = apply_rotary_pos_emb(query, q_pos_emb)
            key = apply_rotary_pos_emb(key, k_pos_emb)

        if layer_past is not None:
            past_key, past_value = layer_past[0], layer_past[1]
            # Cache is kept in (batch, seq, heads, dim) layout; concat on seq.
            key = torch.cat((past_key, key), dim=1)
            value = torch.cat((past_value, value), dim=1)

        if use_cache:
            present = (key, value)
        else:
            present = None

        if self.use_logn_attn and not self.training:
            # Length-dependent query scaling (inference only).
            if self.logn_tensor.device != query.device:
                self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
            seq_start = key.size(1) - query.size(1)
            seq_end = key.size(1)
            logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
            query = query * logn_tensor.expand_as(query)

        if self.use_flash_attn and flash_attn_unpadded_func is not None and not self.is_fp32:
            # Flash-attention fast path (fp16/bf16 only).
            q, k, v = query, key, value
            context_layer = self.core_attention_flash(q, k, v)
            context_layer = rearrange(
                context_layer, "b s h d -> b s (h d)"
            ).contiguous()
        else:
            # Fallback path expects (batch, heads, seq, dim).
            query = query.permute(0, 2, 1, 3)
            key = key.permute(0, 2, 1, 3)
            value = value.permute(0, 2, 1, 3)
            attn_output, attn_weight = self._attn(
                query, key, value, attention_mask, head_mask
            )
            context_layer = self._merge_heads(
                attn_output, self.num_heads, self.head_dim
            )

        attn_output = self.c_proj(context_layer)
        outputs = (attn_output, present)
        if output_attentions:
            if self.use_flash_attn and flash_attn_unpadded_func is not None and not self.is_fp32:
                raise ValueError("Cannot output attentions 
while using flash-attn")
            else:
                outputs += (attn_weight,)

        return outputs


class QWenMLP(nn.Module):
    """SwiGLU feed-forward block: c_proj(w1(x) * silu(w2(x)))."""

    def __init__(self, config):
        super().__init__()
        # ffn_hidden_size is halved because two parallel projections
        # (w1, w2) feed the gated activation.
        self.w1 = nn.Linear(
            config.hidden_size, config.ffn_hidden_size // 2, bias=not config.no_bias
        )
        self.w2 = nn.Linear(
            config.hidden_size, config.ffn_hidden_size // 2, bias=not config.no_bias
        )
        ff_dim_in = config.ffn_hidden_size // 2
        self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias)

    def forward(self, hidden_states):
        a1 = self.w1(hidden_states)
        a2 = self.w2(hidden_states)
        # Gated activation: w1 output modulated by silu(w2 output).
        intermediate_parallel = a1 * F.silu(a2)
        output = self.c_proj(intermediate_parallel)
        return output


class QWenBlock(nn.Module):
    """One transformer layer: RMSNorm -> attention -> RMSNorm -> MLP, with
    residual connections whose source depends on the pre/post-LN config."""

    def __init__(self, config, layer_idx=None, num_expert=1):
        super().__init__()
        self.num_expert = num_expert
        self.layer_number = layer_idx
        self.apply_residual_connection_post_layernorm = (
            config.apply_residual_connection_post_layernorm
        )
        hidden_size = config.hidden_size
        # NOTE(review): duplicate assignment kept from the original source.
        self.apply_residual_connection_post_layernorm = (
            config.apply_residual_connection_post_layernorm
        )
        self.bf16 = config.bf16

        self.ln_1 = RMSNorm(
            hidden_size,
            eps=config.layer_norm_epsilon,
        )
        self.attn = QWenAttention(config, layer_number=layer_idx)
        self.ln_2 = RMSNorm(
            hidden_size,
            eps=config.layer_norm_epsilon,
        )

        self.mlp = QWenMLP(config)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ):
        layernorm_output = self.ln_1(hidden_states)

        attn_outputs = self.attn(
            layernorm_output,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]

        outputs = attn_outputs[1:]

        # Residual source depends on pre/post-layernorm configuration.
        if 
self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states
        layernorm_input = attn_output + residual

        layernorm_output = self.ln_2(layernorm_input)

        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        mlp_output = self.mlp(layernorm_output)
        hidden_states = residual + mlp_output

        # Output tuple: (hidden_states, present?, attentions?).
        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs


class QWenPreTrainedModel(PreTrainedModel):
    """Base class wiring QWen modules into HF's PreTrainedModel machinery
    (weight init, gradient checkpointing toggles, no-split hints)."""

    config_class = QWenConfig
    base_model_prefix = "transformer"
    is_parallelizable = False
    supports_gradient_checkpointing = True
    _no_split_modules = ["QWenBlock"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, RMSNorm):
            module.weight.data.fill_(1.0)

        # Output projections get a depth-scaled (GPT-2 style) init.
        for name, p in module.named_parameters():
            if name == "c_proj.weight":
                p.data.normal_(
                    mean=0.0,
                    std=(
                        self.config.initializer_range
                        / math.sqrt(2 * self.config.n_layer)
                    ),
                )

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, QWenModel):
            module.gradient_checkpointing = value


class QWenModel(QWenPreTrainedModel):
    _keys_to_ignore_on_load_missing = ["attn.masked_bias"]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.padded_vocab_size
        self.num_hidden_layers = config.num_hidden_layers
        self.embed_dim = config.hidden_size

        max_sequence_length = config.max_position_embeddings
        self.position_embedding_type = config.pos_emb
        self.gradient_checkpointing = False

        if self.position_embedding_type 
== "learned": self.wpe = nn.Embedding(max_sequence_length, self.embed_dim) self.init_method(self.position_embeddings.weight) self._position_embeddings_key = "position_embeddings" self.init_method(self.position_embeddings.weight) else: self.wpe = None self._position_embeddings_key = "" self.wte = nn.Embedding(self.vocab_size, self.embed_dim) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList( [ QWenBlock( config, layer_idx=i, ) for i in range(config.num_hidden_layers) ] ) self.ln_f = RMSNorm( self.embed_dim, eps=config.layer_norm_epsilon, ) self.post_init() def get_input_embeddings(self): return self.wte def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings def forward( self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ): output_attentions = ( output_attentions if output_attentions is not None else self.config.output_attentions ) output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = ( return_dict if return_dict is not None else self.config.use_return_dict ) if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time" ) elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) 
batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            # NOTE(review): the attention layer caches keys as
            # (batch, seq, heads, dim), so size(-2) here is the head count,
            # not the cached sequence length.  Harmless with rotary
            # embeddings (position_ids are unused then) but suspect for
            # pos_emb == "learned" -- verify.
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            # Positions continue from the cached prefix.
            position_ids = torch.arange(
                past_length,
                input_shape[-1] + past_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # Build a broadcastable additive mask: 0 keeps a position,
            # finfo.min effectively removes it.
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        encoder_attention_mask = None
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        hidden_states = inputs_embeds
        if self.wpe is not None:
            # Learned absolute position embeddings (pos_emb == "learned").
            position_embeds = self.wpe(position_ids)
            hidden_states = hidden_states + position_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = input_shape + (hidden_states.size(-1),)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
) use_cache = False presents = () if use_cache else None all_self_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value return module(*inputs, use_cache, output_attentions) return custom_forward outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask, ) else: outputs = block( hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i], encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, use_cache=use_cache, output_attentions=output_attentions, ) hidden_states = outputs[0] if use_cache is True: presents = presents + (outputs[2 if output_attentions else 1],) if output_attentions: all_self_attentions = all_self_attentions + (outputs[1],) hidden_states = self.ln_f(hidden_states) hidden_states = hidden_states.view(output_shape) if not return_dict: return tuple( v for v in [hidden_states, presents, all_hidden_states] if v is not None ) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions, ) class QWenLMHeadModel(QWenPreTrainedModel): _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"] _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"] def __init__(self, config): super().__init__(config) self.transformer = QWenModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) assert not(config.bf16 and config.fp16), ("In config, bf16 and fp16 cannot both be true") if config.bf16: 
self.transformer.bfloat16()
            self.lm_head.bfloat16()
        if config.fp16:
            self.transformer.half()
            self.lm_head.half()
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
    ):
        """Trim inputs to the last token when a KV cache is present and
        assemble the kwargs expected by forward()."""
        token_type_ids = kwargs.get("token_type_ids", None)
        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # Rebuild position ids from the mask; padded positions are
            # pinned to 1.
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None

        # inputs_embeds can only be used on the first step (no cache yet).
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
            }
        )
        return model_inputs

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, 
CausalLMOutputWithPast]:
        """Causal LM forward; computes shifted cross-entropy when labels
        are provided."""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that token t predicts token t+1.
            labels = labels.to(lm_logits.device)
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        # Reorder cached states along the batch dim to follow beam search.
        return tuple(
            tuple(
                past_state.index_select(0, beam_idx.to(past_state.device))
                for past_state in layer_past
            )
            for layer_past in past_key_values
        )

    def chat(
        self,
        tokenizer: PreTrainedTokenizer,
        query: str,
        history: Optional[HistoryType],
        system: str = "You are a helpful assistant.",
        append_history: bool = True,
    ) -> Tuple[str, HistoryType]:
        """One chat turn: build the ChatML context, generate, decode the
        reply, and optionally append the turn to history (mutates it)."""
        if history is None:
            history = []

        raw_text, context_tokens = make_context(
            tokenizer,
            query,
            history=history,
            system=system,
            max_window_size=6144,
            chat_format=self.generation_config.chat_format,
        )

        stop_words_ids = get_stop_words_ids(
self.generation_config.chat_format, tokenizer
        )
        input_ids = torch.tensor([context_tokens]).to(self.device)

        outputs = self.generate(
            input_ids,
            stop_words_ids=stop_words_ids,
            return_dict_in_generate=False,
        )

        response = decode_tokens(
            outputs[0],
            tokenizer,
            raw_text_len=len(raw_text),
            context_length=len(context_tokens),
            chat_format=self.generation_config.chat_format,
            verbose=False,
        )

        if append_history:
            history.append((query, response))

        return response, history

    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
        synced_gpus: Optional[bool] = None,
        streamer: Optional["BaseStreamer"] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """HF generate() plus injection of a stop-words logits processor."""
        # Process stop_words_ids.
        # Priority: explicit kwarg > caller's generation_config >
        # self.generation_config.
        stop_words_ids = kwargs.pop('stop_words_ids', None)
        if stop_words_ids is None and generation_config is not None:
            stop_words_ids = getattr(generation_config, 'stop_words_ids', None)
        if stop_words_ids is None:
            stop_words_ids = getattr(self.generation_config, 'stop_words_ids', None)

        if stop_words_ids is not None:
            stop_words_logits_processor = StopWordsLogitsProcessor(
                stop_words_ids=stop_words_ids,
                eos_token_id=self.generation_config.eos_token_id)
            if logits_processor is None:
                logits_processor = LogitsProcessorList([stop_words_logits_processor])
            else:
                logits_processor.append(stop_words_logits_processor)

        return super().generate(
            inputs,
            generation_config,
            logits_processor,
            stopping_criteria,
            prefix_allowed_tokens_fn,
            synced_gpus,
            streamer,
            **kwargs,
        )


class RotaryEmbedding(torch.nn.Module):
    """Rotary position embedding table with NTK-aware base rescaling and a
    lazily rebuilt cache."""

    def __init__(self, dim, base=10000):
        super().__init__()
        self.dim = dim
        self.base = base
        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        if importlib.util.find_spec("einops") is None:
            raise RuntimeError("einops is required for Rotary 
Embedding")
        self._rotary_pos_emb_cache = None
        self._seq_len_cached = 0
        self._ntk_alpha_cached = 1.0

    def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
        """Rebuild the cached (1, seq, 1, dim) angle table when it is too
        short or the NTK alpha changed."""
        seqlen = max_seq_len + offset
        if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
            # NTK scaling stretches the rotary base for longer contexts.
            base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
            self.inv_freq = 1.0 / (
                base
                ** (
                    torch.arange(0, self.dim, 2, device=self.inv_freq.device).float()
                    / self.dim
                )
            )
            self._seq_len_cached = seqlen
            self._ntk_alpha_cached = ntk_alpha
            seq = torch.arange(seqlen, device=self.inv_freq.device)
            freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1)
            from einops import rearrange

            self._rotary_pos_emb_cache = rearrange(emb, "n d -> 1 n 1 d")

    def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
        self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
        return self._rotary_pos_emb_cache[:, offset : offset + max_seq_len]


def _rotate_half(x):
    """Rotate last-dim halves of pairs: (x1, x2) -> (-x2, x1)."""
    from einops import rearrange

    x = rearrange(x, "... (j d) -> ... 
j d", j=2)
    x1, x2 = x.unbind(dim=-2)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(t, freqs, use_flash_rotary=False):
    """Apply rotary embedding `freqs` to tensor `t`.

    Only the first freqs.shape[-1] channels are rotated; the remainder
    passes through unchanged.  Math runs in float32 and the result is cast
    back to t's dtype.
    """
    if use_flash_rotary:
        # Fused kernel path (apply_rotary_emb_func imported elsewhere).
        t_ = t.float()
        freqs = freqs.squeeze(0).squeeze(1)
        cos = freqs[:, : freqs.shape[-1] // 2].cos()
        sin = freqs[:, : freqs.shape[-1] // 2].sin()
        output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
        return output
    else:
        rot_dim = freqs.shape[-1]
        t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
        t_ = t_.float()
        t_pass_ = t_pass_.float()
        t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin())
        return torch.cat((t_, t_pass_), dim=-1).type_as(t)


class RMSNorm(torch.nn.Module):
    """Root-mean-square LayerNorm (no mean subtraction, no bias)."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        # Prefer the fused CUDA kernel when available.
        if rms_norm is not None:
            return rms_norm(x, self.weight, self.eps)
        else:
            output = self._norm(x.float()).type_as(x)
            return output * self.weight


================================================
FILE: src/resources/models/qwen-7B-chat/qwen_generation_utils.py
================================================
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Generation support."""

from typing import Tuple, List, Union, Iterable

import numpy as np
import torch
import torch.nn.functional as F
from transformers import PreTrainedTokenizer
from transformers import logging
from transformers.generation import LogitsProcessor

logger = logging.get_logger(__name__)

# Types.
HistoryType = List[Tuple[str, str]]
TokensType = List[int]
BatchTokensType = List[List[int]]


def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
    """Right-pad each token list in place to seq_length with pad_id."""
    for tokens in batch:
        context_length = len(tokens)
        if context_length < seq_length:
            tokens.extend([pad_id] * (seq_length - context_length))
    return batch


def get_ltor_masks_and_position_ids(
    data,
    eod_token,
    reset_position_ids,
    reset_attention_mask,
    eod_mask_loss,
):
    """Build masks and position id for left to right model."""
    # Extract batch size and sequence length.
    micro_batch_size, seq_length = data.size()

    # Attention mask (lower triangular).
    if reset_attention_mask:
        att_mask_batch = micro_batch_size
    else:
        att_mask_batch = 1
    attention_mask = torch.tril(
        torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
    ).view(att_mask_batch, 1, seq_length, seq_length)

    # Loss mask.
    loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
    if eod_mask_loss:
        loss_mask[data == eod_token] = 0.0

    # Position ids.
    position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
    position_ids = position_ids.unsqueeze(0).expand_as(data)
    # We need to clone as the ids will be modified based on batch index.
    if reset_position_ids:
        position_ids = position_ids.clone()

    if reset_position_ids or reset_attention_mask:
        # Loop through the batches:
        for b in range(micro_batch_size):

            # Find indices where the EOD token is.
            eod_index = position_ids[b, data[b] == eod_token]
            # Detach indices from positions if going to modify positions.
            if reset_position_ids:
                eod_index = eod_index.clone()

            # Loop through EOD indices:
            prev_index = 0
            for j in range(eod_index.size()[0]):
                i = eod_index[j]
                # Mask attention loss.
                if reset_attention_mask:
                    attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
                # Reset positions.
if reset_position_ids:
                    position_ids[b, (i + 1) :] -= i + 1 - prev_index
                    prev_index = i + 1

    # Convert attention mask to binary:
    attention_mask = attention_mask < 0.5

    return attention_mask, loss_mask, position_ids


def get_batch(context_tokens: torch.LongTensor, eod_id: int):
    """Generate batch from context tokens."""
    # Move to GPU.
    tokens = context_tokens.contiguous().to(context_tokens.device)
    # Get the attention mask and position ids.
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        eod_id,
        reset_position_ids=False,
        reset_attention_mask=False,
        eod_mask_loss=False,
    )
    return tokens, attention_mask, position_ids


def get_stop_words_ids(chat_format, tokenizer):
    """Return the stop-word token-id sequences for the given chat format."""
    if chat_format == "raw":
        stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
    elif chat_format == "chatml":
        stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")
    return stop_words_ids


def make_context(
    tokenizer: PreTrainedTokenizer,
    query: str,
    history: List[Tuple[str, str]] = None,
    system: str = "",
    max_window_size: int = 6144,
    chat_format: str = "chatml",
):
    """Assemble the prompt for generation, returning (raw_text, token ids).

    For "chatml", history turns are added newest-first until adding another
    turn would exceed the max_window_size token budget.
    """
    if history is None:
        history = []

    if chat_format == "chatml":
        im_start, im_end = "<|im_start|>", "<|im_end|>"
        im_start_tokens = [tokenizer.im_start_id]
        im_end_tokens = [tokenizer.im_end_id]
        nl_tokens = tokenizer.encode("\n", add_special_tokens=False)

        def _tokenize_str(role, content):
            # Returns ("role\ncontent", corresponding token ids).
            return f"{role}\n{content}", tokenizer.encode(
                role, add_special_tokens=False
            ) + nl_tokens + tokenizer.encode(content, add_special_tokens=False)

        system_text, system_tokens_part = _tokenize_str("system", system)
        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens

        raw_text = ""
        context_tokens = []

        # Walk history from most recent to oldest, prepending turns.
        for turn_query, turn_response in reversed(history):
            query_text, query_tokens_part = _tokenize_str("user", turn_query)
            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
            response_text, response_tokens_part = _tokenize_str(
"assistant", turn_response
            )
            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens

            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
            prev_chat = (
                f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
            )

            current_context_size = (
                len(system_tokens) + len(next_context_tokens) + len(context_tokens)
            )
            if current_context_size < max_window_size:
                context_tokens = next_context_tokens + context_tokens
                raw_text = prev_chat + raw_text
            else:
                break

        context_tokens = system_tokens + context_tokens
        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
        # Append the current user query and open the assistant turn.
        context_tokens += (
            nl_tokens
            + im_start_tokens
            + _tokenize_str("user", query)[1]
            + im_end_tokens
            + nl_tokens
            + im_start_tokens
            + tokenizer.encode("assistant", add_special_tokens=False)
            + nl_tokens
        )
        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"

    elif chat_format == "raw":
        raw_text = query
        context_tokens = tokenizer.encode(raw_text)
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")

    return raw_text, context_tokens


def _decode_default(
    tokens: List[int],
    *,
    stop_words: List[str],
    eod_words: List[str],
    tokenizer: PreTrainedTokenizer,
    raw_text_len: int,
    verbose: bool = False,
    return_end_reason: bool = False,
):
    """Decode tokens, strip stop words, and truncate at the first EOD word."""
    trim_decode_tokens = tokenizer.decode(tokens)[raw_text_len:]
    if verbose:
        print("\nRaw Generate: ", trim_decode_tokens)

    end_reason = f"Gen length {len(tokens)}"
    for stop_word in stop_words:
        trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
    for eod_word in eod_words:
        if eod_word in trim_decode_tokens:
            end_reason = f"Gen {eod_word!r}"
        trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
    trim_decode_tokens = trim_decode_tokens.strip()
    if verbose:
        print("\nEnd Reason:", end_reason)
        print("\nGenerate: ", trim_decode_tokens)

    if return_end_reason:
        return trim_decode_tokens, end_reason
    else:
        return trim_decode_tokens


def _decode_chatml(
    tokens: List[int],
    *,
    stop_words: List[str],
    eod_token_ids: 
List[int],
    tokenizer: PreTrainedTokenizer,
    raw_text_len: int,
    context_length: int,
    verbose: bool = False,
    return_end_reason: bool = False,
):
    """Decode a ChatML generation: truncate at the first EOD token after the
    prompt, drop the prompt prefix, then strip stop words."""
    end_reason = f"Gen length {len(tokens)}"
    eod_token_idx = context_length
    for eod_token_idx in range(context_length, len(tokens)):
        if tokens[eod_token_idx] in eod_token_ids:
            end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
            break

    trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx])[raw_text_len:]
    if verbose:
        print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens)[raw_text_len:])
        print("\nRaw Generate:", trim_decode_tokens)
        print("\nEnd Reason:", end_reason)
    for stop_word in stop_words:
        trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
    trim_decode_tokens = trim_decode_tokens.strip()
    if verbose:
        print("\nGenerate:", trim_decode_tokens)

    if return_end_reason:
        return trim_decode_tokens, end_reason
    else:
        return trim_decode_tokens


def decode_tokens(
    tokens: Union[torch.LongTensor, TokensType],
    tokenizer: PreTrainedTokenizer,
    raw_text_len: int,
    context_length: int,
    chat_format: str,
    verbose: bool = False,
    return_end_reason: bool = False,
) -> str:
    """Dispatch decoding to the chat-format-specific helper."""
    if torch.is_tensor(tokens):
        tokens = tokens.cpu().numpy().tolist()

    if chat_format == "chatml":
        return _decode_chatml(
            tokens,
            stop_words=[],
            eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
            tokenizer=tokenizer,
            raw_text_len=raw_text_len,
            context_length=context_length,
            verbose=verbose,
            return_end_reason=return_end_reason,
        )
    elif chat_format == "raw":
        return _decode_default(
            tokens,
            stop_words=["<|endoftext|>"],
            eod_words=["<|endoftext|>"],
            tokenizer=tokenizer,
            raw_text_len=raw_text_len,
            verbose=verbose,
            return_end_reason=return_end_reason,
        )
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")


class StopWordsLogitsProcessor(LogitsProcessor):
    """
    :class:`transformers.LogitsProcessor` that enforces that when specified
    sequences appear, stop generation.
    Args:
        stop_words_ids (:obj:`List[List[int]]`):
            List of list of token ids of stop ids. In order to get the tokens
            of the words that should not appear in the generated text, use
            :obj:`tokenizer(bad_word, add_prefix_space=True).input_ids`.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token.
    """

    def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
        # Validate: non-empty list of lists of non-negative ints.
        if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
            raise ValueError(
                f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}."
            )
        if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
            raise ValueError(
                f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
            )
        if any(
            any(
                (not isinstance(token_id, (int, np.integer)) or token_id < 0)
                for token_id in stop_word_ids
            )
            for stop_word_ids in stop_words_ids
        ):
            raise ValueError(
                f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
            )

        # Drop a bare [eos_token_id] entry; EOS already terminates generation.
        self.stop_words_ids = list(
            filter(
                lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
            )
        )
        self.eos_token_id = eos_token_id
        for stop_token_seq in self.stop_words_ids:
            assert (
                len(stop_token_seq) > 0
            ), "Stop words token sequences {} cannot have an empty list".format(
                stop_words_ids
            )

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        # When a stop sequence has just been produced, force EOS by boosting
        # its score far above everything else (mutates `scores` in place).
        stopped_samples = self._calc_stopped_samples(input_ids)
        for i, should_stop in enumerate(stopped_samples):
            if should_stop:
                scores[i, self.eos_token_id] = float(2**15)
        return scores

    def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
        """True if `tokens` is a suffix of `prev_tokens` (empty matches all)."""
        if len(tokens) == 0:
            # if bad word tokens is just one token always ban it
            return True
        elif len(tokens) > len(prev_tokens):
            # if bad word tokens are longer then prev input_ids they can't be equal
            return False
        elif prev_tokens[-len(tokens) :].tolist() == tokens:
            # if tokens match
            return True
        else:
            return False

    def _calc_stopped_samples(self, 
prev_input_ids: Iterable[int]) -> Iterable[int]:
        """Per-sample flags: has any stop sequence just been generated?"""
        stopped_samples = []
        for prev_input_ids_slice in prev_input_ids:
            match = False
            for stop_token_seq in self.stop_words_ids:
                if self._tokens_match(prev_input_ids_slice, stop_token_seq):
                    # a stop sequence matched this sample's suffix
                    match = True
                    break
            stopped_samples.append(match)

        return stopped_samples


def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    """This function has been mostly taken from huggingface conversational
    ai code at
        https://medium.com/huggingface/how-to-build-a-state-of-the-art-
                conversational-ai-with-transfer-learning-2d818ac26313

    NOTE(review): filters `logits` in place (mutates the argument).
    """

    if top_k > 0:
        # Remove all tokens with a probability less than the
        # last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # Convert to 1D
        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token
        # above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        for i in range(sorted_indices.size(0)):
            indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
            logits[i][indices_to_remove] = filter_value

    return logits


def switch(val1, val2, boolean):
    """Elementwise select: val2 where boolean is truthy, else val1."""
    boolean = boolean.type_as(val1)
    return (1 - boolean) * val1 + boolean * val2


================================================
FILE: src/resources/models/qwen-7B-chat/tokenization_qwen.py
================================================
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Tokenization classes for QWen."""

from __future__ import absolute_import, division, print_function, unicode_literals

import json
import logging
import os
import unicodedata
from io import open
import base64
import tiktoken
from typing import List, Optional, Tuple, Union

from transformers import PreTrainedTokenizer, AddedToken

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}


class QWenTokenizer(PreTrainedTokenizer):
    """QWen tokenizer."""

    """NOTE: This tokenizer will not handle special tokens to avoid injection attacks"""

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        max_len=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
        add_more_sp_tokens=True,
        **kwargs,
    ):
        # Wrap plain-string special tokens in AddedToken (no stripping).
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False)
            if isinstance(pad_token, str)
            else pad_token
        )
        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
        )
        self.add_bos_token = add_bos_token
        self.max_len = max_len if max_len is not None else int(1e12)

        self.errors = errors  # how to handle errors in decoding

        name = "Qwen"
        ENDOFTEXT = "<|endoftext|>"
        IMSTART = "<|im_start|>"
        IMEND = "<|im_end|>"
        # NOTE(review): the extra special-token strings below appear as
        # empty "" in this copy -- presumably placeholders like
        # <|extra_0|> were lost in transit; verify against the upstream
        # tokenizer before relying on them.
        if add_more_sp_tokens:
            special_tokens = (
                ENDOFTEXT,
                IMSTART,
                IMEND,
                "",
                "",
                "",
                "",
                "",
            ) + tuple([f"" for i in range(200)])
        else:
            special_tokens = (ENDOFTEXT, IMSTART, IMEND)

        PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| 
?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" def load_tiktoken_bpe(tiktoken_bpe_file: str) -> "dict[bytes, int]": contents = open(tiktoken_bpe_file, "rb").read() return { base64.b64decode(token): int(rank) for token, rank in ( line.split() for line in contents.splitlines() if line ) } mergeable_ranks = load_tiktoken_bpe(vocab_file) special_tokens = { token: index for index, token in enumerate(special_tokens, start=len(mergeable_ranks)) } self.special_tokens = special_tokens enc = tiktoken.Encoding( name, pat_str=PAT_STR, mergeable_ranks=mergeable_ranks, special_tokens=special_tokens, ) assert ( len(mergeable_ranks) + len(special_tokens) == enc.n_vocab ), f"{len(mergeable_ranks) + len(special_tokens)} != {enc.n_vocab} in encoding" self.mergeable_ranks = mergeable_ranks self.encoder = self.mergeable_ranks self.decoder = {v: k for k, v in self.encoder.items()} self.tokenizer = enc # type: tiktoken.Encoding self.eod_id = self.tokenizer.eot_token self.im_start_id = special_tokens[IMSTART] self.im_end_id = special_tokens[IMEND] def __len__(self): return self.tokenizer.n_vocab def get_vocab(self): return self.mergeable_ranks def convert_tokens_to_ids(self, tokens): ids = [] # Remove support for py2 if isinstance(tokens, str): if tokens in self.special_tokens: return self.special_tokens[tokens] else: return self.encoder.get(tokens) for token in tokens: if token in self.special_tokens: ids.append(self.special_tokens[token]) else: ids.append(self.encoder.get(token)) if len(ids) > self.max_len: logger.warning( "Token indices sequence length is longer than the specified maximum " " sequence length for this model ({} > {}). Running this" " sequence through the model will result in indexing errors".format( len(ids), self.max_len ) ) return ids def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]: """ Save only the vocabulary of the tokenizer (vocabulary + added tokens). Returns: `Tuple(str)`: Paths to the files saved. 
""" file_path = os.path.join(save_directory, "qwen.tiktoken") with open(file_path, "w", encoding="utf8") as w: for k, v in self.mergeable_ranks.items(): line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n" w.write(line) return (file_path,) def tokenize(self, text: str, **kwargs) -> List[str]: """ Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`. Args: text (`str`): The sequence to be encoded. kwargs (additional keyword arguments, *optional*): Will be passed to the underlying model specific encode method. See details in [`~PreTrainedTokenizerBase.__call__`] Returns: `List[str]`: The list of tokens. """ tokens = [] text = unicodedata.normalize("NFC", text) for t in self.tokenizer.encode_ordinary(text): tokens.append(self.decoder[t]) return tokens def convert_tokens_to_string(self, tokens: List[str]) -> str: """ Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we often want to remove sub-word tokenization artifacts at the same time. """ text = "".join(tokens) text = bytearray([self.byte_decoder[c] for c in text]).decode( "utf-8", errors=self.errors ) return text @property def vocab_size(self): return self.tokenizer.n_vocab def _convert_id_to_token(self, index: int) -> str: if index >= self.tokenizer.n_vocab: return self.unk_token return self.tokenizer.decode([index]) def _convert_token_to_id(self, token: str) -> int: """Converts a token to an id using the vocab.""" return self.encoder.get(token.encode('UTF-8'), self.tokenizer.encode(self.unk_token, allowed_special='all')[0]) @property def all_special_tokens(self) -> List[str]: """ `List[str]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes. Convert tokens of `tokenizers.AddedToken` type to string. """ all_toks = [str(s) for s in self.special_tokens.keys()] return all_toks @property def all_special_ids(self) -> List[int]: """ `List[int]`: List the ids of the special tokens(`''`, `''`, etc.) 
mapped to class attributes. """ all_ids = [v for v in self.special_tokens.values()] return all_ids def _tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Do NOT take care of added tokens. """ raise NotImplementedError def _decode( self, token_ids: Union[int, List[int]], skip_special_tokens: bool = False, **kwargs, ) -> str: if isinstance(token_ids, int): token_ids = [token_ids] if skip_special_tokens: token_ids = [i for i in token_ids if i not in self.all_special_ids] return self.tokenizer.decode(token_ids) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): # bos_token_id = [self.im_start_id] # eos_token_id = [self.im_end_id] bos_token_id = [self.bos_token_id] eos_token_id = [self.eos_token_id] line_break_token_id = [self._convert_token_to_id("\n")] output = bos_token_id + token_ids_0 + eos_token_id + line_break_token_id if token_ids_1 is not None: output += bos_token_id + token_ids_1 + eos_token_id + line_break_token_id return output ================================================ FILE: src/resources/models/qwen-7B-chat/tokenizer_config.json ================================================ { "remove_space": false, "do_lower_case": false, "tokenizer_class": "QWenTokenizer", "auto_map": { "AutoTokenizer": [ "tokenization_qwen.QWenTokenizer", null ] }, "bos_token": { "__type": "AddedToken", "content": "<|im_start|>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false }, "eos_token": { "__type": "AddedToken", "content": "<|im_end|>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false } } ================================================ FILE: src/train_dpo.py ================================================ import sys sys.path.insert(0, "/root/autodl-tmp/Code/RLHF") sys.path.insert(0, 
"/mnt/sfevol775196/sunzeye273/Code/chatgpt") # sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt") sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt") import os import argparse import evaluate import torch import copy from torch.utils.data import SequentialSampler, DataLoader from tqdm import tqdm from transformers import ( TrainingArguments, default_data_collator, ) from src.utils import RESOURCE_PATH, load_tokenizer_and_model, load_checkpoint from src.data.data import DPODataset, SFTDataset from src.utils.file_utils import set_seed, print_rank_0 from src.models.trainer import DPOTrainer # Create a preprocessing function to extract out the proper logits from the model output def preprocess_logits_for_metrics(logits, labels): if isinstance(logits, tuple): logits = logits[0] return logits.argmax(dim=-1) def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--tokenizer_path", type=str, required=True) parser.add_argument("--model_name_or_path", type=str, required=True) parser.add_argument("--reference_model_name_or_path", type=str, default=None) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--max_length", type=int, default=1024) parser.add_argument("--max_length_prompt", type=int, default=512) parser.add_argument("--max_length_generation", type=int, default=None) parser.add_argument("--bits", type=int, default=32, help="bits used to load model, including: 32, 16, 8, 4") parser.add_argument("--device_map", type=str, default=None, help="device map to allocate model," "[None] means cpu" "[0, 1, 2, ...], number means single-card" "[auto, balanced, balanced_low_0] means multi-card") parser.add_argument("--low_cpu_mem_usage", action="store_true", help="whether to enable low cpu memory usage" "when loading 
model") # train parser.add_argument("--do_train", action="store_true") parser.add_argument("--train_filename", type=str, default=None) parser.add_argument("--num_epochs", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-6) parser.add_argument("--beta", type=float, default=0.1, help="the beta parameter for DPO loss") parser.add_argument("--lr_scheduler_type", type=str, default="cosine", help="transformers.trainer_utils.SchedulerType, including:" "linear, cosine, cosine_with_restarts, polynomial, constant," "constant_with_warmup") parser.add_argument("--train_batch_size", type=int, default=4) parser.add_argument("--weight_decay", type=float, default=0.1) parser.add_argument("--warmup_ratio", type=int, default=0.1) parser.add_argument("--logging_steps", type=int, default=100) parser.add_argument("--save_strategy", type=str, default="steps", help='- `"no"`: No save is done during training.' '- `"epoch"`: Save is done at the end of each epoch.' '- `"steps"`: Save is done every `save_steps`.') parser.add_argument("--save_steps", type=int, default=1000) parser.add_argument("--save_total_limit", type=int, default=2) parser.add_argument("--metric_for_best_model", type=str, default=None) parser.add_argument("--gradient_accumulation_steps", type=int, default=8) parser.add_argument("--gradient_checkpointing", action="store_true", help="If True, use gradient checkpointing to save memory at the expense of slower backward pass.") parser.add_argument("--deepspeed_config", type=str, default=None) parser.add_argument("--lora_rank", type=int, default=0) parser.add_argument("--lora_alpha", type=int, default=1) parser.add_argument("--lora_train_bias", type=str, default="none") # eval parser.add_argument("--do_eval", action="store_true") parser.add_argument("--eval_filename", type=str, default=None) parser.add_argument("--eval_batch_size", type=int, default=4) parser.add_argument("--evaluation_strategy", type=str, default="steps", help='- `"no"`: No 
evaluation is done during training.' '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.' '- `"epoch"`: Evaluation is done at the end of each epoch.') parser.add_argument("--eval_steps", type=int, default=100) parser.add_argument("--eval_accumulation_steps", type=int, default=1) # pred parser.add_argument("--do_pred", action="store_true") parser.add_argument("--checkpoint", type=str, default=None) parser.add_argument("--test_filename", type=str, default=None) parser.add_argument("--output_filename", type=str, default=None) parser.add_argument("--do_sample", action="store_true") parser.add_argument("--num_return_sequences", type=int, default=1) parser.add_argument("--top_k", type=int, default=None) parser.add_argument("--top_p", type=float, default=None) parser.add_argument("--temperature", type=float, default=None) args = parser.parse_args() return args def main(): args = get_parser() print_rank_0(f"Parameters: {args}") set_seed(args.seed) # load tokenizer and model tokenizer, model, eos_token_id = load_tokenizer_and_model(args) if args.checkpoint is not None: load_checkpoint(args, model, strict=False) print_rank_0(f"Finished loading model and tokenizer") # Set up the datasets if args.do_train: train_dataset = DPODataset(args, os.path.join(args.data_dir, args.train_filename), tokenizer) else: train_dataset = None if args.do_eval: dev_dataset = DPODataset(args, os.path.join(args.data_dir, args.eval_filename), tokenizer) else: dev_dataset = None if args.do_train: if torch.cuda.is_available(): bf16 = torch.cuda.get_device_capability()[0] >= 8 fp16 = not bf16 else: fp16 = False bf16 = False # training arguments deepspeed_config = os.path.join(RESOURCE_PATH, "config", "deepspeed", args.deepspeed_config) if args.deepspeed_config is not None else None training_args = TrainingArguments( output_dir=args.output_dir, no_cuda=not torch.cuda.is_available(), seed=args.seed, data_seed=args.seed, local_rank=args.local_rank, do_train=args.do_train, 
num_train_epochs=args.num_epochs, learning_rate=args.learning_rate, lr_scheduler_type=args.lr_scheduler_type, per_device_train_batch_size=args.train_batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, warmup_ratio=args.warmup_ratio, weight_decay=args.weight_decay, half_precision_backend="auto", fp16=fp16, bf16=bf16, optim="paged_adamw_8bit", # adam_beta1=0.9, # adam_beta2=0.95, save_strategy=args.save_strategy, save_steps=args.save_steps, save_total_limit=args.save_total_limit, metric_for_best_model=args.metric_for_best_model, greater_is_better=True, logging_steps=args.logging_steps, report_to=["tensorboard"], deepspeed=deepspeed_config, gradient_checkpointing=args.gradient_checkpointing, do_eval=args.do_eval, evaluation_strategy=args.evaluation_strategy, eval_steps=args.eval_steps, eval_accumulation_steps=args.eval_accumulation_steps, per_device_eval_batch_size=args.eval_batch_size, # do_predict=args.do_pred, # use_legacy_prediction_loop=args.do_pred, remove_unused_columns=False, ) print_rank_0(f"Training Arguments: {training_args}") # load reference model or precomputed reference result if args.output_filename is not None: logps = torch.load(os.path.join(args.output_dir, args.output_filename)) ref_model = None else: logps = None ref_args = copy.deepcopy(args) ref_args.device_map = "auto" if args.reference_model_name_or_path is not None: ref_args.model_name_or_path = args.reference_model_name_or_path else: ref_args.bits = 4 _, ref_model, _ = load_tokenizer_and_model(ref_args) ref_model.eval() # Prepare the trainer and start training trainer = DPOTrainer( model=model, ref_model=ref_model, logps=logps, args=training_args, beta=args.beta, train_dataset=train_dataset, eval_dataset=dev_dataset, tokenizer=tokenizer, data_collator=default_data_collator, # compute_metrics=compute_metrics, # preprocess_logits_for_metrics=preprocess_logits_for_metrics, label_pad_token_id=tokenizer.pad_token_id ) # model.config.use_cache = False # silence the warnings. 
Please re-enable for inference! trainer.train() trainer.save_model(args.output_dir) elif args.do_eval: # res = trainer.evaluate(eval_dataset=dev_dataset) # print_rank_0(res) pass if args.do_pred: def _get_batch_logps( logits: torch.FloatTensor, labels: torch.LongTensor, average_log_prob: bool = False, ) -> torch.FloatTensor: """Compute the log probabilities of the given labels under the given logits. Args: logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) labels: Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are ignored. Shape: (batch_size, sequence_length) average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens. Returns: A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits. """ if logits.shape[:-1] != labels.shape: raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") labels = labels[:, 1:].clone() logits = logits[:, :-1, :] loss_mask = labels != tokenizer.pad_token_id # dummy token; we'll ignore the losses on these tokens later labels[labels == tokenizer.pad_token_id] = 0 per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) if average_log_prob: return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) else: return (per_token_logps * loss_mask).sum(-1) model.eval() device = f"cuda:{args.local_rank}" if torch.cuda.is_available() and args.device_map is not None else "cpu" logps = dict() for test_filename in args.test_filename.split(","): if "train" in test_filename: mode = "train" else: mode = "eval" logps[mode] = dict() test_filename = os.path.join(args.data_dir, test_filename) test_dataset = DPODataset(args, test_filename, tokenizer) sampler = SequentialSampler(test_dataset) test_loader = 
DataLoader(test_dataset, batch_size=args.eval_batch_size, sampler=sampler) with torch.no_grad(): for batch in tqdm(test_loader, desc=f"Prediction on {mode}"): indices = batch['index'].tolist() chosen_input_ids = batch['chosen_input_ids'].to(device) chosen_attention_mask = batch['chosen_attention_mask'].to(device) if 'chosen_attention_mask' in batch else None rejected_input_ids = batch['rejected_input_ids'].to(device) rejected_attention_mask = batch['rejected_attention_mask'].to(device) if 'rejected_attention_mask' in batch else None chosen_logits = model(chosen_input_ids, chosen_attention_mask).logits.detach().cpu().to(torch.float32) chosen_logps = _get_batch_logps(chosen_logits, batch["chosen_labels"], average_log_prob=False) rejected_logits = model(rejected_input_ids, rejected_attention_mask).logits.detach().cpu().to(torch.float32) rejected_logps = _get_batch_logps(rejected_logits, batch["rejected_labels"], average_log_prob=False) for index, chosen_logp, rejected_logp in zip(indices, chosen_logps, rejected_logps): logps[mode][index] = {"chosen_logp": chosen_logp, "rejected_logp": rejected_logp} torch.save(logps, os.path.join(args.output_dir, args.output_filename)) if __name__ == "__main__": main() ================================================ FILE: src/train_reward.py ================================================ import sys sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt") sys.path.insert(0, "/Users/zeyesun/Documents/Code/RLHF") sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt") import os import torch import argparse from tqdm import tqdm from transformers import ( Trainer, TrainingArguments, default_data_collator, ) from torch.utils.data import DataLoader, SequentialSampler from src.models.reward import RewardModel from src.utils import logger, RESOURCE_PATH, load_tokenizer_and_model, load_checkpoint from src.utils.file_utils import set_seed, print_rank_0 from src.data.data import SFTDataset, PairwiseDataset, 
DataCollatorReward def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--model_name_or_path", type=str, required=True) parser.add_argument("--tokenizer_path", type=str, required=True) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--max_length", type=int, default=1024) parser.add_argument("--bits", type=int, default=16) parser.add_argument("--device_map", type=str, default=None, help="device map to allocate model," "[None] means cpu" "[0, 1, 2, ...], number means single-card" "[auto, balanced, balanced_low_0] means multi-card") parser.add_argument("--low_cpu_mem_usage", action="store_true", help="whether to enable low cpu memory usage" "when loading model") # train parser.add_argument("--do_train", action="store_true") parser.add_argument("--train_filename", type=str, default=None) parser.add_argument("--num_epochs", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-6) parser.add_argument("--lr_scheduler_type", type=str, default="cosine", help="transformers.trainer_utils.SchedulerType, including:" "linear, cosine, cosine_with_restarts, polynomial, constant," "constant_with_warmup") parser.add_argument("--train_batch_size", type=int, default=4) parser.add_argument("--freeze_ratio", type=float, default=0.0, help="ratio of layers frozen for reward training") parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--warmup_ratio", type=int, default=0.1) parser.add_argument("--logging_steps", type=int, default=100) parser.add_argument("--save_strategy", type=str, default="steps", help='- `"no"`: No save is done during training.' '- `"epoch"`: Save is done at the end of each epoch.' 
'- `"steps"`: Save is done every `save_steps`.') parser.add_argument("--save_steps", type=int, default=1000) parser.add_argument("--save_total_limit", type=int, default=2) parser.add_argument("--metric_for_best_model", type=str, default=None) parser.add_argument("--gradient_accumulation_steps", type=int, default=8) parser.add_argument("--max_grad_norm", type=float, default=1.0) parser.add_argument("--gradient_checkpointing", action="store_true", help="If True, use gradient checkpointing to save memory at the expense of slower backward pass.") parser.add_argument("--deepspeed_config", type=str, default=None) parser.add_argument("--lora_rank", type=int, default=0) parser.add_argument("--lora_alpha", type=int, default=1) parser.add_argument("--lora_train_bias", type=str, default="none") # eval parser.add_argument("--do_eval", action="store_true") parser.add_argument("--eval_filename", type=str, default=None) parser.add_argument("--eval_batch_size", type=int, default=4) parser.add_argument("--evaluation_strategy", type=str, default="steps", help='- `"no"`: No evaluation is done during training.' '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.' 
'- `"epoch"`: Evaluation is done at the end of each epoch.') parser.add_argument("--eval_steps", type=int, default=100) parser.add_argument("--eval_accumulation_steps", type=int, default=1) # pred parser.add_argument("--do_pred", action="store_true") parser.add_argument("--checkpoint", type=str, default=None) parser.add_argument("--test_filename", type=str, default=None) parser.add_argument("--output_filename", type=str, default=None) args = parser.parse_args() return args def main(): args = get_parser() print_rank_0(f"Parameters: {args}") set_seed(args.seed) # load model and tokenizer tokenizer, model, eos_token_id = load_tokenizer_and_model(args) # Initialize the reward model from the (supervised) fine-tuned SFT model if "pangu" in args.model_name_or_path.lower(): reward_model = RewardModel(model.config, model.transformer, tokenizer) elif "chatglm" in args.model_name_or_path.lower(): reward_model = RewardModel(model.config, model.transformer, tokenizer) elif "glm" in args.model_name_or_path.lower(): reward_model = RewardModel(model.config, model.glm, tokenizer) else: reward_model = RewardModel(model.config, model.model, tokenizer) assert model.config.pad_token_id == tokenizer.pad_token_id if args.checkpoint is not None: load_checkpoint(args, reward_model, strict=False) if args.device_map is not None and args.bits not in [4, 8]: reward_model = reward_model.half() print_rank_0(f"Finished loading model and tokenizer") # Set up the datasets if args.do_train: train_dataset = PairwiseDataset(args, os.path.join(args.data_dir, args.train_filename), tokenizer) else: train_dataset = None if args.do_eval: val_dataset = PairwiseDataset(args, os.path.join(args.data_dir, args.eval_filename), tokenizer) else: val_dataset = None if args.do_pred: test_dataset = SFTDataset(args, os.path.join(args.data_dir, args.test_filename), tokenizer, concat_samples=False) else: test_dataset = None if args.do_train or args.do_eval: # training arguments deepspeed_config = 
os.path.join(RESOURCE_PATH, "config", "deepspeed", args.deepspeed_config) if args.deepspeed_config is not None else None if torch.cuda.is_available(): bf16 = torch.cuda.get_device_capability()[0] >= 8 fp16 = False if bf16 else True else: fp16 = False bf16 = False training_args = TrainingArguments( output_dir=args.output_dir, no_cuda=not torch.cuda.is_available(), seed=args.seed, data_seed=args.seed, local_rank=args.local_rank, do_train=args.do_train, num_train_epochs=args.num_epochs, learning_rate=args.learning_rate, lr_scheduler_type=args.lr_scheduler_type, per_device_train_batch_size=args.train_batch_size, max_grad_norm=args.max_grad_norm, gradient_accumulation_steps=args.gradient_accumulation_steps, warmup_ratio=args.warmup_ratio, weight_decay=args.weight_decay, half_precision_backend="auto", fp16=fp16, bf16=bf16, save_strategy=args.save_strategy, save_steps=args.save_steps, save_total_limit=args.save_total_limit, metric_for_best_model=args.metric_for_best_model, greater_is_better=True, logging_steps=args.logging_steps, report_to=["tensorboard"], deepspeed=deepspeed_config, gradient_checkpointing=args.gradient_checkpointing, do_eval=args.do_eval, evaluation_strategy=args.evaluation_strategy, eval_steps=args.eval_steps, eval_accumulation_steps=args.eval_accumulation_steps, per_device_eval_batch_size=args.eval_batch_size, label_names=["labels"], # do_predict=args.do_pred, # use_legacy_prediction_loop=args.do_pred, ) print_rank_0(f"Training Arguments: {training_args}") def compute_metrics(eval_preds): chosen_end_scores = eval_preds.predictions[1] # chosen scores rejected_end_scores = eval_preds.predictions[3] # rejected scores result = {} acc = sum(chosen_end_scores > rejected_end_scores) / len(rejected_end_scores) result["accuracy"] = acc return result # Prepare the trainer and start training trainer = Trainer( model=reward_model, args=training_args, train_dataset=train_dataset, compute_metrics=compute_metrics, eval_dataset=val_dataset, ) if args.do_train: 
trainer.train() trainer.save_model(args.output_dir) elif args.do_eval: eval_result = trainer.evaluate(eval_dataset=val_dataset) print_rank_0(eval_result) if args.do_pred: reward_model.eval() device = f"cuda:{args.local_rank}" if torch.cuda.is_available() and args.device_map is not None else "cpu" sampler = SequentialSampler(test_dataset) test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, sampler=sampler) rewards = [] with torch.no_grad(): for batch in tqdm(test_loader, desc="Prediction"): chosen_input_ids = batch['input_ids'].to(device) chosen_attention_mask = batch['attention_mask'].to(device) if 'attention_mask' in batch else None chosen_position_ids = batch['position_ids'].to(device) if 'position_ids' in batch else None output = reward_model(chosen_input_ids, chosen_attention_mask, chosen_position_ids) rewards.extend(output['chosen_reward'].cpu().detach().tolist()) # save result into file with open(os.path.join(args.output_dir, args.output_filename), "w", encoding="utf-8") as w: w.write("\t".join(("prompt", "answer", "score"))+"\n") for item, reward in zip(test_dataset.post_list, rewards): w.write("\t".join((item["prompt"], item["label"], str(reward))) + "\n") print_rank_0(f"Finished prediction and saving into {args.output_filename}") if __name__ == "__main__": main() ================================================ FILE: src/train_rlhf.py ================================================ import sys sys.path.insert(0, "/root/autodl-tmp/Code/RLHF") sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt") # sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt") sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt") import os import argparse import torch import random import copy import deepspeed import numpy as np from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, default_data_collator from torch.utils.data import RandomSampler, DistributedSampler, DataLoader from 
src.models.rlhf_engine import DeepSpeedRLHFEngine from src.models.trainer import DeepSpeedPPOTrainer, DeepSpeedPPOPTXTrainer from src.utils.file_utils import set_seed, print_rank_0, print_gpu_utilization_torch, print_gpu_utilization from src.data.data import SFTDataset, RLHFDataset, PPODataset from src.utils.modeling_utils import get_all_reduce_mean, save_hf_format, moving_average, save_zero_three_model def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--tokenizer_path", type=str, required=True) parser.add_argument("--actor_model_path", type=str, required=True) parser.add_argument("--critic_model_path", type=str, required=True) parser.add_argument("--critic_checkpoint", type=str, required=True) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--max_length", type=int, default=512, help="total max sequence length = max prompt length + mas generation/answer length") parser.add_argument("--max_gen_length", type=int, default=256, help="max generation/answer length") # train parser.add_argument("--do_train", action="store_true") parser.add_argument("--train_filename", type=str, default=None) parser.add_argument("--pretrain_filename", type=str, default=None, help="pretraining dataset (for PPO-ptx)") parser.add_argument("--num_epochs", type=int, default=1, help="Total number of training epochs to perform.") parser.add_argument("--ppo_epochs", type=int, default=1, help="Number of epochs to perform ppo training for each experience") parser.add_argument("--actor_learning_rate", type=float, default=1e-6) parser.add_argument("--critic_learning_rate", type=float, default=1e-6) parser.add_argument("--lr_scheduler_type", type=str, default="linear", help="transformers.trainer_utils.SchedulerType, including:" "linear, cosine, cosine_with_restarts, polynomial, 
constant," "constant_with_warmup") parser.add_argument("--train_batch_size", type=int, default=4) parser.add_argument("--ppo_train_batch_size", type=int, default=4, help="PPO training mini batch size (per device)") parser.add_argument("--ppo_batch_numbers", type=int, default=1, help="number of batches for PPO training") parser.add_argument("--actor_weight_decay", type=float, default=0.1) parser.add_argument("--critic_weight_decay", type=float, default=0.1) parser.add_argument("--warmup_ratio", type=int, default=0.1) parser.add_argument("--logging_steps", type=int, default=100) parser.add_argument('--disable_actor_dropout', action='store_true', help='Disable the dropout of the actor model.') parser.add_argument('--disable_critic_dropout', action='store_true', help='Disable the dropout of the critic model.') parser.add_argument("--pretrain_coef", type=float, default=10.0, help="coefficient of pretraining loss in ppo-ptx objective function") parser.add_argument("--save_total_limit", type=int, default=2) parser.add_argument("--gradient_accumulation_steps", type=int, default=8) parser.add_argument("--do_sample", action="store_true") parser.add_argument("--num_return_sequences", type=int, default=1) parser.add_argument("--top_k", type=int, default=50) parser.add_argument("--top_p", type=float, default=0.8) parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--kl_coefficient", type=float, default=0.02, help="Coefficient of KL divergence, used in computing modified reward") parser.add_argument("--clip_reward_value", type=float, default=5.0) parser.add_argument("--clip_range", type=float, default=0.2, help="Clip range of policy, used in computing clipped policy loss") parser.add_argument("--clip_range_value", type=float, default=0.2, help="Clip range of value function, used in computing clipped value function loss") parser.add_argument("--gamma", type=float, default=1.0, help="first decaying factor, used in computing advantages") 
parser.add_argument("--lambda_", type=float, default=0.95, help="second decaying factor, used in computing advantages") parser.add_argument('--enable_joint_optimization', action='store_true', help="Enable joint optimization of policy and reward") # deepspeed parser.add_argument('--enable_hybrid_engine', action='store_true', help="Enable hybrid engine for actor model to optimize both inference and training through DeepSpeed.") parser.add_argument('--actor_zero_stage', type=int, default=0, help='ZeRO optimization stage for Actor model (and clones).') parser.add_argument('--critic_zero_stage', type=int, default=0, help='ZeRO optimization stage for Critic model (and reward).') parser.add_argument('--offload', action='store_true', help='Enable ZeRO Offload techniques.') parser.add_argument('--offload_reference_model', action='store_true', help='Enable ZeRO Offload techniques for reference model') parser.add_argument("--actor_gradient_checkpointing", action="store_true", help="whether to use gradient checkpointing for actor model") parser.add_argument("--critic_gradient_checkpointing", action="store_true", help="whether to use gradient checkpointing for critic model") parser.add_argument("--unpin_actor_parameters", action='store_true', help="Unpin actor's parameters during generation. This makes generation slower but requires less memory.") parser.add_argument("--release_inference_cache", action='store_true', help="Release the memory cache used for inference. This makes generation preparation slower but might increase e2e throughput by using larger batch size.") parser.add_argument("--inference_tp_size", type=int, default=1, help="Tensor-parallelism degree used for the inference-optimization. Please note hybrid-engine need to be enabled when using this feature.") parser.add_argument("--tp_gather_partition_size", type=int, default=8, help="Granularity to bring in layers for TP sharding inside the hybrid engine. 
Please note hybrid-engine and tp_inference_size > 1 need to be true when using this feature.") # parser.add_argument("--num_layers_unfrozen", type=int, default=-1, help="Number of layers to unfreeze for fine-tuning") parser.add_argument('--enable_ema', action='store_true', help='Enable EMA checkpoint for the model.') # lora parser.add_argument("--actor_lora_rank", type=int, default=0) parser.add_argument("--actor_lora_alpha", type=int, default=16) parser.add_argument("--critic_lora_rank", type=int, default=0) parser.add_argument("--critic_lora_alpha", type=int, default=16) parser.add_argument("--lora_alpha", type=int, default=1) parser.add_argument("--lora_train_bias", type=str, default="none") # eval parser.add_argument("--do_eval", action="store_true") parser.add_argument("--eval_filename", type=str, default=None) parser.add_argument("--eval_batch_size", type=int, default=4) parser.add_argument("--evaluation_strategy", type=str, default="epoch", help='- `"no"`: No evaluation is done during training.' '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.' 
'- `"epoch"`: Evaluation is done at the end of each epoch.')
    parser.add_argument("--eval_steps", type=int, default=None)
    parser.add_argument("--eval_accumulation_steps", type=int, default=1)
    # pred
    parser.add_argument("--do_pred", action="store_true")
    parser.add_argument("--checkpoint", type=str, default=None)
    parser.add_argument("--test_filename", type=str, default=None)
    parser.add_argument("--output_filename", type=str, default=None)
    args = parser.parse_args()

    return args


def create_datasets(args, tokenizer_padding_from_left, ppo_ptx_enabled, tokenizer_padding_from_right):
    """Build the RLHF prompt dataset and, when PPO-ptx is enabled, the pretraining dataset.

    Returns (train_dataset, pretrain_dataset, num_total_iters), where
    num_total_iters is the estimated total number of PPO optimizer updates
    across all epochs (the caller later uses it to size the warmup schedule).
    Prompts are tokenized with the LEFT-padding tokenizer (for generation);
    pretraining samples with the RIGHT-padding one.
    """
    train_dataset = RLHFDataset(args, os.path.join(args.data_dir, args.train_filename), tokenizer_padding_from_left)
    iters_prompt = len(train_dataset) // args.train_batch_size
    if ppo_ptx_enabled:
        pretrain_dataset = SFTDataset(args, os.path.join(args.data_dir, args.pretrain_filename),
                                      tokenizer_padding_from_right, concat_samples=False)
        iters_pretrain = len(pretrain_dataset) // args.train_batch_size
    else:
        pretrain_dataset = None
        # np.inf so that min(iters_prompt, iters_pretrain) below reduces to iters_prompt
        iters_pretrain = np.inf
    # Updates per epoch: generation rounds * (generated batch / PPO mini-batch)
    # * PPO epochs per round, scaled down by gradient accumulation.
    num_update_steps_per_epoch = min(iters_prompt, iters_pretrain) * \
                                 (args.train_batch_size / args.ppo_train_batch_size) * \
                                 args.ppo_epochs / args.gradient_accumulation_steps
    num_total_iters = int(args.num_epochs * num_update_steps_per_epoch)
    return train_dataset, pretrain_dataset, num_total_iters


def create_dataloader(args, train_dataset, pretrain_dataset=None):
    """Create the prompt dataloader (batch_size fixed at 1 for generation) and,
    optionally, the pretraining dataloader.

    When pretrain_dataset is None a list of None placeholders of the same
    length is returned instead, so callers can zip() the two uniformly.
    Must be called after deepspeed.initialize() when using DistributedSampler.
    """
    # DataLoaders creation:
    # data_collator = DataCollatorRLHF(args.max_length, pad_token_id)
    if args.local_rank == -1:
        # single-process run: plain random sampling
        prompt_train_sampler = RandomSampler(train_dataset)
        if pretrain_dataset is not None:
            pretrain_sampler = RandomSampler(pretrain_dataset)
    else:
        # distributed run: shard samples across ranks
        prompt_train_sampler = DistributedSampler(train_dataset)
        if pretrain_dataset is not None:
            pretrain_sampler = DistributedSampler(pretrain_dataset)
    # prompt_train_dataloader is only used for generate_experience() where batch size is fixed at 1
    prompt_train_dataloader = DataLoader(
        train_dataset,
        # collate_fn=data_collator,
        sampler=prompt_train_sampler,
        batch_size=1)
    # batch_size=args.train_batch_size)
    if pretrain_dataset is not None:
        pretrain_dataloader = DataLoader(
            pretrain_dataset,
            # collate_fn=default_data_collator,
            sampler=pretrain_sampler,
            batch_size=args.train_batch_size)
    else:
        pretrain_dataloader = [None] * len(prompt_train_dataloader)
    # num_update_steps_per_epoch = min(len(prompt_train_dataloader), len(pretrain_dataloader)) * \
    #     (args.train_batch_size / args.ppo_train_batch_size) * \
    #     args.ppo_epochs / args.gradient_accumulation_steps
    # num_total_iters = int(args.num_epochs * num_update_steps_per_epoch)
    return prompt_train_dataloader, pretrain_dataloader  # , num_total_iters


def main():
    args = get_parser()
    assert not args.enable_hybrid_engine, "DeepSpeed currently does not support Pangu-based or GLM-based model in hybrid engine"
    if args.local_rank == -1:
        device = torch.device("cuda")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        # deepspeed.init_distributed()
    # args.global_rank = torch.distributed.get_rank()
    print_rank_0(f"Parameters: {args}")
    set_seed(args.seed)
    # torch.distributed.barrier()
    # Set PPO-ptx: enabled whenever a pretraining corpus is supplied; the actor
    # then accumulates over twice as many micro-steps (PPO + unsupervised).
    ppo_ptx_enabled = args.pretrain_filename is not None
    if ppo_ptx_enabled:
        args.gradient_accumulation_steps_actor = args.gradient_accumulation_steps * 2
    else:
        args.gradient_accumulation_steps_actor = args.gradient_accumulation_steps
    # n_gpus = torch.distributed.get_world_size()
    n_gpus = torch.cuda.device_count()
    args.global_train_batch_size_actor = args.ppo_train_batch_size * args.gradient_accumulation_steps_actor * n_gpus
    args.global_train_batch_size_critic = args.ppo_train_batch_size * args.gradient_accumulation_steps * n_gpus
    # load tokenizer (two copies: right-padding for training, left-padding for generation)
    tokenizer_padding_from_right = AutoTokenizer.from_pretrained(args.tokenizer_path, use_cache=False, trust_remote_code=True)
    if "chatglm2" in args.tokenizer_path:
        tokenizer_padding_from_right.eop_token_id
= tokenizer_padding_from_right.get_command("eop") tokenizer_padding_from_left = copy.deepcopy(tokenizer_padding_from_right) # tokenizer.pad_token = tokenizer.eos_token tokenizer_padding_from_left.padding_side = "left" # PS: padding side slightly affect output of sft generation and reward model result args.max_prompt_length = args.max_length - args.max_gen_length print_gpu_utilization("before create deepspeed rlhf engine", args.local_rank) print_gpu_utilization_torch("before create deepspeed rlhf engine", args.local_rank) if args.do_train: # load data and create dataset prompt_train_dataset, pretrain_dataset, num_total_iters = create_datasets(args, tokenizer_padding_from_left, ppo_ptx_enabled, tokenizer_padding_from_right) args.warmup_steps = int(num_total_iters * args.warmup_ratio) # load rlhf engine rlhf_engine = DeepSpeedRLHFEngine( actor_model_name_or_path=args.actor_model_path, critic_model_name_or_path=args.critic_model_path, tokenizer=tokenizer_padding_from_right, num_total_iters=num_total_iters, args=args) # create dataloader [need to be called after rlhf engine initialization because # DistributedSampler can only be called after deepspeed.initialize() is called] prompt_train_dataloader, pretrain_dataloader = create_dataloader(args, prompt_train_dataset, pretrain_dataset) # create deepspeed ppo trainer print_gpu_utilization("after create deepspeed rlhf engine", args.local_rank) print_gpu_utilization_torch("after create deepspeed rlhf engine", args.local_rank) ppo_trainer = DeepSpeedPPOPTXTrainer if ppo_ptx_enabled else DeepSpeedPPOTrainer trainer = ppo_trainer(rlhf_engine, args) print_gpu_utilization("after create deepspeed trainer", args.local_rank) print_gpu_utilization_torch("after create deepspeed trainer", args.local_rank) # create ppo experience dataset exp_mini_dataset = PPODataset(args.ppo_batch_numbers, args.ppo_train_batch_size) pretrain_mini_dataset = PPODataset(args.ppo_batch_numbers, args.ppo_train_batch_size) print_rank_0("Start training") for 
epoch in range(args.num_epochs): print_rank_0(f"Beginning of Epoch {epoch+1}/{args.num_epochs}, " f"Total Generation Batches {min(len(prompt_train_dataloader), len(pretrain_dataloader))}") prompt_iter = iter(prompt_train_dataloader) pretrain_iter = iter(pretrain_dataloader) step = 0 # for step, (batch_prompt, batch_pretrain) in enumerate(zip(prompt_train_dataloader, pretrain_dataloader)): while True: # generate sequence: generate only one sequence at a time, aggregate to form a batch answer_start_indices = [] output_sequences = dict() for _ in range(args.train_batch_size): try: batch_prompt = next(prompt_iter) batch_prompt = {k: v.to(device) for k, v in batch_prompt.items()} outputs, prompt_length = trainer.generate_sequence(batch_prompt) answer_start_indices.append(prompt_length-1) for key, val in outputs.items(): if key not in output_sequences: output_sequences[key] = [] output_sequences[key].append(val[0]) except StopIteration: break if len(output_sequences) > 0: output_sequences = {key: torch.stack(val).to(device) for key, val in output_sequences.items()} output_experiences = trainer.generate_experience(output_sequences, answer_start_indices) exp_dataset = exp_mini_dataset.add(output_experiences) else: exp_dataset = None try: batch_pretrain = next(pretrain_iter) if batch_pretrain is not None: batch_pretrain = {k: v.to(device) for k, v in batch_pretrain.items()} pretrain_dataset = pretrain_mini_dataset.add(batch_pretrain) else: pretrain_dataset = pretrain_mini_dataset.add([[None] * args.train_batch_size]) except StopIteration: pass if exp_dataset is not None: inner_iter = 0 critic_loss, actor_loss, pretrain_loss = 0, 0, 0 average_reward = 0 if args.actor_gradient_checkpointing: rlhf_engine.actor.gradient_checkpointing_enable() for ppo_ep in range(args.ppo_epochs): for i, (exp_data, pretrain_data) in enumerate( zip(exp_dataset, pretrain_dataset)): actor_loss, critic_loss = trainer.train_rlhf(exp_data) critic_loss += actor_loss.item() actor_loss += 
critic_loss.item() average_reward += exp_data["rewards"].mean() if ppo_ptx_enabled: pretrain_loss = trainer.train_unsupervised(pretrain_data, args.pretrain_coef) pretrain_loss += pretrain_loss.item() inner_iter += 1 if args.enable_ema: moving_average(rlhf_engine.actor, rlhf_engine.actor_ema, zero_stage=args.actor_zero_stage) random.shuffle(exp_dataset) random.shuffle(pretrain_dataset) print_rank_0(f'epoch: {epoch}, step: {step}, ppo_ep: {ppo_ep+1}, act_loss: {actor_loss/inner_iter},' f'cri_loss: {critic_loss/inner_iter}, pretrain_loss: {pretrain_loss/inner_iter}') average_reward = get_all_reduce_mean(average_reward).item() print_rank_0(f"average reward score: {average_reward/inner_iter}") if args.actor_gradient_checkpointing: rlhf_engine.actor.gradient_checkpointing_disable() step += 1 print_rank_0('saving model ...') # if args.actor_lora_rank > 0: # rlhf_engine.actor = convert_lora_to_linear_layer(rlhf_engine.actor) # if args.enable_ema: # rlhf_engine.actor_ema = convert_lora_to_linear_layer(rlhf_engine.actor_ema) # if args.critic_lora_rank > 0: # rlhf_engine.critic = convert_lora_to_linear_layer(rlhf_engine.critic) if args.local_rank == 0: save_hf_format(rlhf_engine.actor, tokenizer_padding_from_right, args, sub_folder='actor') save_hf_format(rlhf_engine.critic, tokenizer_padding_from_right, args, sub_folder='critic') if args.enable_ema: save_hf_format(rlhf_engine.actor_ema, tokenizer_padding_from_right, args, sub_folder='actor_ema') if args.actor_zero_stage == 3: save_zero_three_model(rlhf_engine.actor, global_rank=args.local_rank, save_dir=os.path.join(args.output_dir, 'actor'), zero_stage=args.actor_zero_stage) if args.enable_ema: save_zero_three_model(rlhf_engine.actor_ema, global_rank=args.local_rank, save_dir=os.path.join(args.output_dir, 'actor_ema'), zero_stage=args.actor_zero_stage) if args.critic_zero_stage == 3: save_zero_three_model(rlhf_engine.critic, global_rank=args.local_rank, save_dir=os.path.join(args.output_dir, 'critic'), 
zero_stage=args.critic_zero_stage) if __name__ == "__main__": main() ================================================ FILE: src/train_rlhf_trlx.py ================================================ import sys sys.path.insert(0, "/root/autodl-tmp/Code/RLHF") sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt") # sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt") sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt") import os import argparse import torch import glob from typing import Callable, Dict, Iterable, List, Optional, Tuple, Any from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM from src.utils import logger, RESOURCE_PATH from src.utils.config import TRLConfig, default_ilql_config, default_ppo_config, default_sft_config from src.models.reward import RewardModel from src.utils.file_utils import set_seed from src.data.data import RLHFDataset from src.utils.loading import get_pipeline, get_trainer def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--tokenizer_path", type=str, required=True) parser.add_argument("--sft_model_path", type=str, required=True) parser.add_argument("--reward_model_path", type=str, required=True) parser.add_argument("--reward_checkpoint", type=str, required=True) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--max_length", type=int, default=1024) # train parser.add_argument("--do_train", action="store_true") parser.add_argument("--train_filename", type=str, default=None) parser.add_argument("--num_epochs", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-6) parser.add_argument("--lr_scheduler_type", type=str, default="linear", help="transformers.trainer_utils.SchedulerType, including:" "linear, 
cosine, cosine_with_restarts, polynomial, constant," "constant_with_warmup") parser.add_argument("--train_batch_size", type=int, default=4) parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--warmup_ratio", type=int, default=0.1) parser.add_argument("--logging_steps", type=int, default=100) parser.add_argument("--save_strategy", type=str, default="epoch", help='- `"no"`: No save is done during training.' '- `"epoch"`: Save is done at the end of each epoch.' '- `"steps"`: Save is done every `save_steps`.') parser.add_argument("--save_steps", type=int, default=None) parser.add_argument("--save_total_limit", type=int, default=2) parser.add_argument("--gradient_accumulation_steps", type=int, default=8) parser.add_argument("--gradient_checkpointing", action="store_true", help="If True, use gradient checkpointing to save memory at the expense of slower backward pass.") parser.add_argument("--deepspeed_config", type=str, default=None) parser.add_argument("--ppo_config", type=str, default=None) parser.add_argument("--num_layers_unfrozen", type=int, default=-1, help="Number of layers to unfreeze for fine-tuning") parser.add_argument("--lora_rank", type=int, default=0) parser.add_argument("--lora_alpha", type=int, default=1) parser.add_argument("--lora_train_bias", type=str, default="none") # eval parser.add_argument("--do_eval", action="store_true") parser.add_argument("--eval_filename", type=str, default=None) parser.add_argument("--eval_batch_size", type=int, default=4) parser.add_argument("--evaluation_strategy", type=str, default="epoch", help='- `"no"`: No evaluation is done during training.' '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.' 
'- `"epoch"`: Evaluation is done at the end of each epoch.') parser.add_argument("--eval_steps", type=int, default=None) parser.add_argument("--eval_accumulation_steps", type=int, default=1) # pred parser.add_argument("--do_pred", action="store_true") parser.add_argument("--checkpoint", type=str, default=None) parser.add_argument("--test_filename", type=str, default=None) parser.add_argument("--output_filename", type=str, default=None) args = parser.parse_args() return args def train(model_path: Optional[str] = None, reward_fn: Optional[Callable[[List[Any], Any], torch.Tensor]] = None, dataset: Optional[Iterable[Tuple[str, float]]] = None, samples: Optional[List[str]] = None, rewards: Optional[List[float]] = None, prompts: Optional[List[str]] = None, eval_prompts: Optional[List[str]] = None, metric_fn: Optional[Callable[[List[str], List[str], List[str]], Dict[str, List[float]]]] = None, config: Optional[TRLConfig] = None, stop_sequences: Optional[List[str]] = [],): if config is None: logger.warn( "Passing the `config` argument implicitly is depreciated, use or" "adapt some from default configs instead" ) if reward_fn: config = default_ppo_config() elif rewards: config = default_ilql_config() else: config = default_sft_config() set_seed(config.train.seed) if dataset: logger.warn("the `dataset` argument is being depreciated, split it into `samples` and `rewards` instead") samples, rewards = dataset if model_path: config.model.model_path = model_path trainer = get_trainer(config.train.trainer)( config=config, reward_fn=reward_fn, metric_fn=metric_fn, stop_sequences=stop_sequences, **config.train.trainer_kwargs, ) batch_size = config.train.batch_size * int(os.environ.get("WORLD_SIZE", 1)) max_prompt_length = config.train.seq_length - config.method.gen_kwargs["max_new_tokens"] # Online training against a reward function (e.g. 
PPO) if reward_fn: prompts = prompts or [trainer.tokenizer.bos_token] * batch_size if eval_prompts is None: eval_prompts = prompts[:batch_size] pipeline = get_pipeline(config.train.pipeline)(prompts, config, trainer.tokenizer) trainer.add_prompt_pipeline(pipeline) if eval_prompts is None: eval_prompts = prompts[:batch_size] trainer.make_experience(config.method.num_rollouts) # Offline training from the collected samples (e.g. SFT, ILQL) elif samples: if rewards: if len(samples) != len(rewards): raise ValueError(f"Number of samples {len(samples)} should match the number of rewards {len(rewards)}") if eval_prompts is None: eval_prompts = [trainer.tokenizer.bos_token] * batch_size if rewards: trainer.make_experience(samples, rewards, config.train.seq_length) else: trainer.store = get_pipeline(config.train.pipeline)(samples, config, trainer.tokenizer) else: raise ValueError("Either `samples` or `reward_fn` should be given for training") eval_pipeline = get_pipeline(config.train.pipeline)(eval_prompts, config, trainer.tokenizer) trainer.add_eval_pipeline(eval_pipeline) trainer.learn() def main(): args = get_parser() logger.info(f"Parameters: {args}") set_seed(args.seed) # load tokenizer tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, use_cache=False, trust_remote_code=True) # tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "left" # PS: padding side does affect output of reward model # load reward model if "pangu" in args.reward_model_path: model = AutoModelForCausalLM.from_pretrained(args.reward_model_path, use_cache=False, trust_remote_code=True) model.resize_token_embeddings(tokenizer.vocab_size) # model.config.end_token_id = tokenizer.eos_token_id # model.config.pad_token_id = tokenizer.pad_token_id # model.config.bos_token_id = tokenizer.bos_token_id # model.config.eos_token_id = tokenizer.eos_token_id model.config.lora_rank = args.lora_rank model.config.lora_alpha = args.lora_alpha model.config.lora_train_bias = 
args.lora_train_bias
        model.config.target_modules = "q_proj,k_proj,v_proj"
        model.config.task_type = "CAUSAL_LM"
        # Initialize the reward model from the (supervised) fine-tuned SFT model
        reward_model = RewardModel(model.config, model.transformer, tokenizer)
        # reward_model = RewardModelWithLoRA(model.config, model.transformer, tokenizer)
    elif "chatglm" in args.reward_model_path:
        model = AutoModelForSeq2SeqLM.from_pretrained(args.reward_model_path, trust_remote_code=True).half()
        model.config.lora_rank = args.lora_rank
        model.config.lora_alpha = args.lora_alpha
        model.config.lora_train_bias = args.lora_train_bias
        model.config.target_modules = "query_key_value"
        model.config.task_type = "SEQ_2_SEQ_LM"
        # Initialize the reward model from the (supervised) fine-tuned SFT model
        reward_model = RewardModel(model.config, model, tokenizer)
        # reward_model = RewardModelWithLoRA(model.config, model.glm, tokenizer)
    elif "glm" in args.reward_model_path:
        model = AutoModelForSeq2SeqLM.from_pretrained(args.reward_model_path, trust_remote_code=True)
        model.config.lora_rank = args.lora_rank
        model.config.lora_alpha = args.lora_alpha
        model.config.lora_train_bias = args.lora_train_bias
        model.config.target_modules = "query_key_value"
        model.config.task_type = "SEQ_2_SEQ_LM"
        # Initialize the reward model from the (supervised) fine-tuned SFT model
        reward_model = RewardModel(model.config, model.glm, tokenizer)
        # reward_model = RewardModelWithLoRA(model.config, model.glm, tokenizer)
    else:
        raise ValueError(f"Unsupported model name: {args.reward_model_path}")
    assert model.config.pad_token_id == tokenizer.pad_token_id
    if args.reward_checkpoint is not None:
        # "star" in the checkpoint pattern is a stand-in for a shell glob "*"
        checkpoints = glob.glob(args.reward_checkpoint.replace("star", "*"))
        st = dict()
        for checkpoint in checkpoints:
            st.update(torch.load(checkpoint, map_location="cpu"))
        res = reward_model.load_state_dict(st, strict=False)
    device = torch.device(f"cuda:{args.local_rank}") if torch.cuda.is_available() else torch.device("cpu")
    # reward_model.half()
    reward_model.eval()
    reward_model.to(device)
    logger.info(f"Finish loading reward model from {args.reward_checkpoint}")

    def reward_fn(samples, **kwargs):
        """Score generated samples with the frozen reward model.

        Each sample is "<prompt><sep><pred>"; samples are scored in chunks of
        ppo_config.train.batch_size and the per-sample "chosen_reward" scores
        are concatenated into one tensor.
        """
        scores_list = []
        for i in range(0, len(samples), ppo_config.train.batch_size):
            input_ids_list = []
            attention_mask_list = []
            position_ids_list = []
            for sample in samples[i: i + ppo_config.train.batch_size]:
                prompt, pred = sample.split(tokenizer.sep_token, maxsplit=1)
                logger.debug(f"prompt: {prompt}, pred: {pred}")
                if "pangu" in ppo_config.model.model_path:
                    # NOTE(review): this branch appends 2-D tensors (shape (1, L))
                    # while the branches below append 1-D rows ([0]); after
                    # torch.stack the batch ranks differ — confirm the reward
                    # model accepts the extra dimension.
                    encodings_dict = tokenizer(prompt, pred, max_length=ppo_config.train.seq_length, truncation="longest_first", padding="max_length", return_tensors="pt", return_token_type_ids=False)
                    input_ids_list.append(encodings_dict["input_ids"])
                    attention_mask_list.append(encodings_dict["attention_mask"])
                elif "chatglm" in ppo_config.model.model_path:
                    encoded_dict = tokenizer(prompt, pred, max_length=ppo_config.train.seq_length, return_tensors="pt", truncation="longest_first", padding="max_length")
                    input_ids_list.append(encoded_dict["input_ids"][0])
                elif "glm" in ppo_config.model.model_path:
                    # TODO: to be modified for and tested against glm
                    encoded_prompt = tokenizer(prompt, tokenizer.mask_token)
                    prompt_length = len(encoded_prompt['input_ids'])
                    label_length = len(tokenizer.tokenize(pred))
                    if prompt_length + label_length > ppo_config.train.seq_length:
                        # trim the longer of prompt/label one token at a time
                        num_tokens_to_remove = prompt_length + label_length - ppo_config.train.seq_length
                        for _ in range(num_tokens_to_remove):
                            if prompt_length > label_length:
                                prompt_length -= 1
                            else:
                                label_length -= 1
                    else:
                        label_length = ppo_config.train.seq_length - prompt_length
                    assert prompt_length > 0
                    assert label_length > 0
                    assert prompt_length + label_length <= ppo_config.train.seq_length
                    encoded_dict = tokenizer(prompt, tokenizer.mask_token, max_length=prompt_length, truncation="only_first", return_tensors="pt", return_attention_mask=True, return_token_type_ids=False)
                    encoded_dict = tokenizer.build_inputs_for_generation(encoded_dict, targets=pred, max_gen_length=label_length, padding=True)
                    input_ids_list.append(encoded_dict["input_ids"][0])
                    attention_mask_list.append(encoded_dict["attention_mask"][0])
                    position_ids_list.append(encoded_dict["position_ids"][0])
                else:
                    raise ValueError(f"Unsupported model type: {ppo_config.model.model_path}")
            # encodings_dict = tokenizer(
            #     sub_samples,
            #     max_length=ppo_config.train.seq_length,
            #     truncation="longest_first",
            #     padding="max_length",
            #     return_tensors="pt",
            # )
            input_ids = torch.stack(input_ids_list, dim=0).to(device)
            # attention_mask/position_ids stay None for branches that never fill them
            attention_mask = torch.stack(attention_mask_list, dim=0).to(device) if len(attention_mask_list) > 0 else None
            position_ids = torch.stack(position_ids_list, dim=0).to(device) if len(position_ids_list) > 0 else None
            with torch.no_grad():
                sub_scores = reward_model(input_ids, attention_mask, position_ids)
            scores_list.append(sub_scores["chosen_reward"])
        scores = torch.cat(scores_list, dim=0)
        return scores

    # load ppo config and override it from the CLI arguments
    ppo_config = TRLConfig.load_yaml(os.path.join(RESOURCE_PATH, "config", "ppo_model", args.ppo_config))
    ppo_config.train.epochs = args.num_epochs
    ppo_config.train.seq_length = args.max_length
    ppo_config.train.batch_size = args.train_batch_size
    ppo_config.train.checkpoint_dir = args.output_dir
    ppo_config.train.checkpoint_interval = args.save_steps
    ppo_config.train.eval_interval = args.eval_steps
    ppo_config.model.num_layers_unfrozen = args.num_layers_unfrozen
    ppo_config.model.model_path = args.sft_model_path
    ppo_config.tokenizer.tokenizer_path = args.tokenizer_path
    ppo_config.optimizer.kwargs['lr'] = args.learning_rate
    ppo_config.optimizer.kwargs['weight_decay'] = args.weight_decay
    ppo_config.method.chunk_size = args.eval_batch_size
    ppo_config.train.lora_rank = args.lora_rank
    ppo_config.train.lora_alpha = args.lora_alpha
    ppo_config.train.lora_train_bias = args.lora_train_bias
    logger.info(f"PPO config: {ppo_config}")
    # load dataset
    if args.do_train:
        train_dataset = RLHFDataset.load_dataset(os.path.join(args.data_dir,
args.train_filename)) else: train_dataset = None if args.do_eval: dev_dataset = RLHFDataset.load_dataset(os.path.join(args.data_dir, args.eval_filename)) else: dev_dataset = None if args.do_train: train(model_path=args.sft_model_path, reward_fn=reward_fn, prompts=train_dataset, eval_prompts=dev_dataset, config=ppo_config) if __name__ == "__main__": main() ================================================ FILE: src/train_sft.py ================================================ import sys sys.path.insert(0, "/root/autodl-tmp/Code/RLHF") sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt") # sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt") sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt") import os import argparse import evaluate import torch from tqdm import tqdm from transformers import ( Trainer, TrainingArguments, default_data_collator, ) from src.utils import RESOURCE_PATH, load_tokenizer_and_model, load_checkpoint from src.data.data import SFTDataset, chatglm2_encode, chatglm3_encode from src.utils.file_utils import set_seed, print_rank_0 # from src.models import convert_to_lora_recursively # Create a preprocessing function to extract out the proper logits from the model output def preprocess_logits_for_metrics(logits, labels): if isinstance(logits, tuple): logits = logits[0] return logits.argmax(dim=-1) def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--tokenizer_path", type=str, required=True) parser.add_argument("--model_name_or_path", type=str, required=True) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--max_length", type=int, default=1024) parser.add_argument("--max_length_generation", type=int, default=None) parser.add_argument("--bits", type=int, default=32, help="bits 
used to load model, including: 32, 16, 8, 4") parser.add_argument("--device_map", type=str, default=None, help="device map to allocate model," "[None] means cpu" "[0, 1, 2, ...], number means single-card" "[auto, balanced, balanced_low_0] means multi-card") parser.add_argument("--low_cpu_mem_usage", action="store_true", help="whether to enable low cpu memory usage" "when loading model") # train parser.add_argument("--do_train", action="store_true") parser.add_argument("--train_filename", type=str, default=None) parser.add_argument("--concat_samples", action="store_true") parser.add_argument("--num_epochs", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-6) parser.add_argument("--lr_scheduler_type", type=str, default="cosine", help="transformers.trainer_utils.SchedulerType, including:" "linear, cosine, cosine_with_restarts, polynomial, constant," "constant_with_warmup") parser.add_argument("--train_batch_size", type=int, default=4) parser.add_argument("--weight_decay", type=float, default=0.1) parser.add_argument("--warmup_ratio", type=int, default=0.1) parser.add_argument("--logging_steps", type=int, default=100) parser.add_argument("--save_strategy", type=str, default="steps", help='- `"no"`: No save is done during training.' '- `"epoch"`: Save is done at the end of each epoch.' 
'- `"steps"`: Save is done every `save_steps`.') parser.add_argument("--save_steps", type=int, default=1000) parser.add_argument("--save_total_limit", type=int, default=2) parser.add_argument("--metric_for_best_model", type=str, default=None) parser.add_argument("--gradient_accumulation_steps", type=int, default=8) parser.add_argument("--gradient_checkpointing", action="store_true", help="If True, use gradient checkpointing to save memory at the expense of slower backward pass.") parser.add_argument("--deepspeed_config", type=str, default=None) parser.add_argument("--lora_rank", type=int, default=0) parser.add_argument("--lora_alpha", type=int, default=1) parser.add_argument("--lora_train_bias", type=str, default="none") # eval parser.add_argument("--do_eval", action="store_true") parser.add_argument("--eval_filename", type=str, default=None) parser.add_argument("--eval_batch_size", type=int, default=4) parser.add_argument("--evaluation_strategy", type=str, default="steps", help='- `"no"`: No evaluation is done during training.' '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.' 
                        '- `"epoch"`: Evaluation is done at the end of each epoch.')
    parser.add_argument("--eval_steps", type=int, default=100)
    parser.add_argument("--eval_accumulation_steps", type=int, default=1)
    # pred
    parser.add_argument("--do_pred", action="store_true")
    parser.add_argument("--checkpoint", type=str, default=None)
    parser.add_argument("--test_filename", type=str, default=None)
    parser.add_argument("--output_filename", type=str, default=None)
    parser.add_argument("--do_sample", action="store_true")
    parser.add_argument("--num_return_sequences", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=None)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--temperature", type=float, default=None)

    args = parser.parse_args()

    return args


def main():
    """SFT entry point driven by CLI flags.

    --do_train: fine-tune with transformers.Trainer (optionally under DeepSpeed).
    --do_eval:  evaluation is performed by the Trainer per evaluation_strategy/eval_steps.
    --do_pred:  autoregressive generation over the test set, written out as TSV.
    """
    args = get_parser()
    print_rank_0(f"Parameters: {args}")

    set_seed(args.seed)

    # Load tokenizer and model. eos_token_id is returned by the loader because
    # some model families (e.g. GLM variants) use a non-standard end token.
    tokenizer, model, eos_token_id = load_tokenizer_and_model(args)
    if args.checkpoint is not None:
        # strict=False: tolerate partial state dicts (e.g. adapters/resumed checkpoints)
        load_checkpoint(args, model, strict=False)
    print_rank_0(f"Finished loading model and tokenizer")

    # Set up only the datasets required by the requested phases.
    # Sample packing (concat_samples) is applied to training data only.
    if args.do_train:
        train_dataset = SFTDataset(args, os.path.join(args.data_dir, args.train_filename), tokenizer,
                                   concat_samples=args.concat_samples)
    else:
        train_dataset = None
    if args.do_eval:
        dev_dataset = SFTDataset(args, os.path.join(args.data_dir, args.eval_filename), tokenizer,
                                 concat_samples=False)
    else:
        dev_dataset = None
    if args.do_pred:
        test_dataset = SFTDataset(args, os.path.join(args.data_dir, args.test_filename), tokenizer,
                                  concat_samples=False)
    else:
        test_dataset = None

    if args.do_train:
        # Prefer bf16 on compute capability >= 8 (Ampere+), fall back to fp16
        # on older GPUs; CPU-only runs use full precision.
        if torch.cuda.is_available():
            bf16 = torch.cuda.get_device_capability()[0] >= 8
            fp16 = not bf16
        else:
            fp16 = False
            bf16 = False
        # training arguments (DeepSpeed config file is optional)
        deepspeed_config = os.path.join(RESOURCE_PATH, "config", "deepspeed", args.deepspeed_config) \
            if args.deepspeed_config is not None else None
        training_args = TrainingArguments(
            output_dir=args.output_dir,
            no_cuda=not torch.cuda.is_available(),
            seed=args.seed,
            data_seed=args.seed,
            local_rank=args.local_rank,
            do_train=args.do_train,
            num_train_epochs=args.num_epochs,
            learning_rate=args.learning_rate,
            lr_scheduler_type=args.lr_scheduler_type,
            per_device_train_batch_size=args.train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            warmup_ratio=args.warmup_ratio,
            weight_decay=args.weight_decay,
            half_precision_backend="auto",
            fp16=fp16,
            bf16=bf16,
            optim="paged_adamw_8bit",
            # adam_beta1=0.9,
            # adam_beta2=0.95,
            save_strategy=args.save_strategy,
            save_steps=args.save_steps,
            save_total_limit=args.save_total_limit,
            metric_for_best_model=args.metric_for_best_model,
            greater_is_better=True,
            logging_steps=args.logging_steps,
            report_to=["tensorboard"],
            deepspeed=deepspeed_config,
            gradient_checkpointing=args.gradient_checkpointing,
            do_eval=args.do_eval,
            evaluation_strategy=args.evaluation_strategy,
            eval_steps=args.eval_steps,
            eval_accumulation_steps=args.eval_accumulation_steps,
            per_device_eval_batch_size=args.eval_batch_size,
            # do_predict=args.do_pred,
            # use_legacy_prediction_loop=args.do_pred,
        )
        print_rank_0(f"Training Arguments: {training_args}")

        # Set up the metric (ROUGE over decoded predictions vs. labels)
        rouge = evaluate.load("rouge")

        def compute_metrics(eval_preds):
            # eval_preds.predictions are token ids because
            # preprocess_logits_for_metrics already applied argmax.
            labels_ids = eval_preds.label_ids
            pred_ids = eval_preds.predictions
            pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
            label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
            result = rouge.compute(predictions=pred_str, references=label_str)
            return result

        # Prepare the trainer and start training
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
            compute_metrics=compute_metrics,
            data_collator=default_data_collator,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )
        # model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
        trainer.train()
        trainer.save_model(args.output_dir)
    elif args.do_eval:
        # Standalone evaluation is currently disabled (would require a Trainer instance).
        # res = trainer.evaluate(eval_dataset=dev_dataset)
        # print_rank_0(res)
        pass

    if args.do_pred:
        model.eval()
        # NOTE(review): when device_map is None the inputs stay on CPU here even if
        # CUDA is available — confirm this matches how the model was placed.
        device = f"cuda:{args.local_rank}" if torch.cuda.is_available() and args.device_map is not None else "cpu"
        tokenizer.padding_side = "left"
        with open(os.path.join(args.output_dir, args.output_filename), "w", encoding="utf-8") as w:
            # TSV header: prompt followed by one column per returned sequence
            w.write("\t".join(["prompt"]+[f"model_answer_{i}" for i in range(args.num_return_sequences)])+"\n")
            for test_data in tqdm(test_dataset.post_list, desc="Prediction"):
                prompt = test_data['prompt']
                prefix = test_data.get('prefix', "")
                system = test_data.get('system', "")
                # Each model family needs its own prompt encoding; prompt_length is
                # recorded so the prompt tokens can be stripped from the output.
                if "chatglm3" in args.model_name_or_path.lower():
                    _, _, prompt_ids = chatglm3_encode(tokenizer, prompt, None, system, args.max_length)
                    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
                    outputs = model.generate(input_ids=input_ids,
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=eos_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                    prompt_length = len(prompt_ids)
                elif "chatglm2" in args.model_name_or_path.lower():
                    _, _, prompt_ids = chatglm2_encode(tokenizer, prompt, None, system, args.max_length)
                    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
                    outputs = model.generate(input_ids=input_ids,
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=eos_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                    prompt_length = len(prompt_ids)
                elif "chatglm" in args.model_name_or_path.lower():
                    # ChatGLM v1: system message is prepended to the prompt as plain text
                    prompt = "\n\n".join((system, prompt))
                    encoded_prompt = tokenizer(prompt)
                    prompt_length = len(encoded_prompt['input_ids'])
                    inputs = tokenizer(prompt, max_length=min(prompt_length, args.max_length),
                                       truncation="only_first", return_tensors="pt")
                    # max_gen_length = args.max_length - encoded_dict['input_ids'].shape[1]
                    # inputs = tokenizer.build_inputs_for_generation(encoded_dict,
                    #                                                max_gen_length=max_gen_length, padding=True)
                    inputs = inputs.to(device)
                    outputs = model.generate(**inputs,
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=eos_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                    prompt_length = len(inputs['input_ids'][0])
                elif "glm" in args.model_name_or_path.lower():
                    # GLM: the answer prefix plus [MASK] is appended and filled via
                    # build_inputs_for_generation.
                    encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)
                    prompt_length = len(encoded_prompt['input_ids'])
                    encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token,
                                             max_length=min(prompt_length, args.max_length),
                                             truncation="only_first", return_tensors="pt",
                                             return_token_type_ids=False)
                    max_gen_length = args.max_length - encoded_dict['input_ids'].shape[1]
                    inputs = tokenizer.build_inputs_for_generation(encoded_dict,
                                                                   max_gen_length=max_gen_length, padding=True)
                    inputs = inputs.to(device)
                    outputs = model.generate(**inputs,
                                             max_new_tokens=min(args.max_length_generation, max_gen_length),
                                             eos_token_id=eos_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                    prompt_length = len(inputs['input_ids'][0])
                else:
                    # Default (e.g. Pangu-style) encoding: prompt <sep> prefix.
                    # NOTE(review): unlike the branches above, eos_token_id is not
                    # passed to generate() here — confirm the model's default EOS is intended.
                    inputs = tokenizer(prompt, tokenizer.sep_token + prefix, max_length=args.max_length,
                                       truncation="only_first", add_special_tokens=False,
                                       return_tensors="pt", return_token_type_ids=False)
                    # inputs = tokenizer(prompt, add_special_tokens=False, return_token_type_ids=False, return_tensors="pt")
                    inputs = inputs.to(device)
                    outputs = model.generate(**inputs,
                                             max_new_tokens=args.max_length_generation,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                    prompt_length = len(inputs['input_ids'][0])
                # Strip the prompt tokens; keep only the generated continuation.
                results = tokenizer.batch_decode([output[prompt_length:] for output in outputs],
                                                 skip_special_tokens=True)
                w.write("\t".join([prompt]+results)+"\n")


if __name__ == "__main__":
    main()
# ================================================
# FILE: src/train_sft_wo_trainer.py
# ================================================
import sys
sys.path.insert(0, "/root/autodl-tmp/Code/RLHF")
sys.path.insert(0, "/mnt/sfevol775196/sunzeye273/Code/chatgpt")
# sys.path.insert(0, "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt")
sys.path.insert(0, "/mnt/pa002-28359-vol543625-private/Code/chatgpt")
import os
import argparse
import evaluate
import torch
import json
import deepspeed

from tqdm import tqdm
from torch.utils.data import RandomSampler, DistributedSampler, DataLoader
from transformers.deepspeed import HfDeepSpeedConfig

from src.utils import logger, RESOURCE_PATH, load_tokenizer_and_model, load_checkpoint
from src.data.data import SFTDataset
from src.utils.file_utils import set_seed, print_rank_0
from src.utils.modeling_utils import rotate_checkpoints, save_zero_three_model
# from src.models import convert_to_lora_recursively


# Create a preprocessing function to extract out the proper logits from the model output
def preprocess_logits_for_metrics(logits, labels):
    # Models may return a tuple whose first element is the logits tensor.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Reduce to predicted token ids so downstream metrics work on ids, not logits.
    return logits.argmax(dim=-1)


def get_parser():
    """Build the CLI argument parser for SFT training without the HF Trainer
    (DeepSpeed engine is driven manually in main())."""
    parser = argparse.ArgumentParser()
    # common / model loading
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--model_name_or_path", type=str, required=True)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--max_length", type=int, default=1024)
    parser.add_argument("--bits", type=int, default=16)
    parser.add_argument("--device_map", type=str, default=None,
                        help="device map to allocate model,"
                             "[None] means cpu"
                             "[0, 1, 2, ...], number means single-card"
                             "[auto, balanced, balanced_low_0] means multi-card")
    parser.add_argument("--low_cpu_mem_usage", action="store_true",
                        help="whether to enable low cpu memory usage"
                             "when loading model")
    parser.add_argument("--max_length_generation", type=int, default=None)
    # train
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--train_filename", type=str, default=None)
    parser.add_argument("--concat_samples", action="store_true")
    parser.add_argument("--num_epochs", type=int, default=1)
    parser.add_argument("--learning_rate", type=float, default=1e-6)
    parser.add_argument("--lr_scheduler_type", type=str, default="OneCycle",
                        help="deepspeed scheduler types, including:"
                             "LRRangeTest, OneCycle, WarmupLR, WarmupDecayLR")
    parser.add_argument("--train_batch_size", type=int, default=4)
    parser.add_argument("--weight_decay", type=float, default=0.1)
    parser.add_argument("--warmup_steps", type=int, default=100)
    parser.add_argument("--logging_steps", type=int, default=100)
    parser.add_argument("--save_strategy", type=str, default="steps",
                        help='- `"no"`: No save is done during training.'
                             '- `"epoch"`: Save is done at the end of each epoch.'
                             '- `"steps"`: Save is done every `save_steps`.')
    parser.add_argument("--save_steps", type=int, default=1000)
    parser.add_argument("--save_total_limit", type=int, default=2)
    parser.add_argument("--metric_for_best_model", type=str, default=None)
    parser.add_argument("--max_grad_norm", type=float, default=1.0)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8)
    parser.add_argument("--gradient_checkpointing", action="store_true",
                        help="If True, use gradient checkpointing to save memory at the expense of slower backward pass.")
    parser.add_argument("--deepspeed_config", type=str, default=None)
    parser.add_argument("--lora_rank", type=int, default=0)
    parser.add_argument("--lora_alpha", type=int, default=1)
    parser.add_argument("--lora_train_bias", type=str, default="none")
    # eval
    parser.add_argument("--do_eval", action="store_true")
    parser.add_argument("--eval_filename", type=str, default=None)
    parser.add_argument("--eval_batch_size", type=int, default=4)
    parser.add_argument("--evaluation_strategy", type=str, default="steps",
                        help='- `"no"`: No evaluation is done during training.'
                             '- `"steps"`: Evaluation is done (and logged) every `eval_steps`.'
                             '- `"epoch"`: Evaluation is done at the end of each epoch.')
    parser.add_argument("--eval_steps", type=int, default=100)
    parser.add_argument("--eval_accumulation_steps", type=int, default=1)
    # pred
    parser.add_argument("--do_pred", action="store_true")
    parser.add_argument("--checkpoint", type=str, default=None)
    parser.add_argument("--test_filename", type=str, default=None)
    parser.add_argument("--output_filename", type=str, default=None)
    parser.add_argument("--do_sample", action="store_true")
    parser.add_argument("--num_return_sequences", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=None)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--temperature", type=float, default=None)

    args = parser.parse_args()

    return args
def main():
    """SFT entry point that drives the DeepSpeed engine directly (no HF Trainer).

    --do_train: manual train loop over a DistributedSampler DataLoader with
                periodic logging / eval / checkpoint rotation.
    --do_pred:  autoregressive generation over the test set, written out as TSV.
    """
    args = get_parser()
    print_rank_0(f"Parameters: {args}")

    set_seed(args.seed)
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)

    # create HfDeepSpeedConfig [must be called before instantiating model]
    if args.deepspeed_config is not None:
        ds_config_filename = os.path.join(RESOURCE_PATH, "config", "deepspeed", args.deepspeed_config)
        ds_config = json.load(open(ds_config_filename, "r", encoding="utf-8"))
        # ds_config["steps_per_print"] = args.logging_steps
        ds_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
        ds_config["gradient_accumulation_steps"] = args.gradient_accumulation_steps
        ds_config["gradient_clipping"] = args.max_grad_norm
        # TODO: before calling dist init, world size is always 1, therefore
        # ds_config['train_batch_size'] cannot multiply world size
        ds_config['train_batch_size'] = args.train_batch_size * args.gradient_accumulation_steps  # * torch.cuda.device_count()
        # TODO: assuming hidden_size=4096
        ds_config["zero_optimization"]["reduce_bucket_size"] = 4096 * 4096
        ds_config["zero_optimization"]["stage3_prefetch_bucket_size"] = 0.9 * 4096 * 4096
        ds_config["zero_optimization"]["stage3_param_persistence_threshold"] = 10 * 4096
        # Prefer bf16 on compute capability >= 8, otherwise fp16; CPU runs full precision.
        if torch.cuda.is_available():
            bf16 = torch.cuda.get_device_capability()[0] >= 8
            fp16 = False if bf16 else True
        else:
            fp16 = False
            bf16 = False
        ds_config["fp16"]["enabled"] = fp16
        ds_config["bf16"]["enabled"] = bf16
        ds_config["optimizer"]["params"] = {
            "lr": args.learning_rate,
            "betas": [0.9, 0.95],
            "eps": 1e-8,
            "weight_decay": args.weight_decay
        }
        # Scheduler params below are OneCycle-style; the config file must agree
        # with the CLI choice.
        assert ds_config["scheduler"]['type'] == args.lr_scheduler_type
        ds_config["scheduler"]["params"] = {
            "cycle_min_lr": 0,
            "cycle_max_lr": args.learning_rate,
            "cycle_first_step_size": args.warmup_steps
        }
        # Keep this object alive: HfDeepSpeedConfig enables zero.Init during
        # model instantiation for ZeRO-3.
        dschf = HfDeepSpeedConfig(ds_config)

    # load tokenizer and model
    tokenizer, model, eos_token_id = load_tokenizer_and_model(args, with_trainer=False)
    if args.checkpoint is not None:
        load_checkpoint(args, model, strict=False)
    print_rank_0(f"Finished loading model and tokenizer")

    # Set up the datasets
    if args.do_train:
        train_dataset = SFTDataset(args, os.path.join(args.data_dir, args.train_filename), tokenizer,
                                   concat_samples=args.concat_samples)
    else:
        train_dataset = None
    if args.do_eval:
        dev_dataset = SFTDataset(args, os.path.join(args.data_dir, args.eval_filename), tokenizer,
                                 concat_samples=False)
    else:
        dev_dataset = None
    if args.do_pred:
        test_dataset = SFTDataset(args, os.path.join(args.data_dir, args.test_filename), tokenizer,
                                  concat_samples=False)
    else:
        test_dataset = None

    if args.do_train:
        # # Optimizer
        # AdamOptimizer = DeepSpeedCPUAdam if "3" in args.deepspeed_config else FusedAdam
        # optim_params = get_optimizer_grouped_parameters(
        #     actor_model, self.args.actor_weight_decay)
        # optim = AdamOptimizer(optim_params,
        #                       lr=self.args.actor_learning_rate,
        #                       betas=(0.9, 0.95))
        #
        # # LR Scheduler
        # lr_scheduler = get_scheduler(
        #     name=self.args.lr_scheduler_type,
        #     optimizer=optim,
        #     num_warmup_steps=self.args.warmup_steps,
        #     num_training_steps=self.num_total_iters,
        # )
        # deepspeed initialize; now the world size is known so the global batch
        # size can include the device count.
        ds_config['train_batch_size'] = args.train_batch_size * args.gradient_accumulation_steps * torch.cuda.device_count()
        model_engine, *_ = deepspeed.initialize(model=model,
                                                # optimizer=optim,
                                                # lr_scheduler=lr_scheduler,
                                                config=ds_config)
        # create data loader
        train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            # collate_fn=data_collator,
            sampler=train_sampler,
            batch_size=args.train_batch_size)
        # training
        model_engine.train()
        if args.gradient_checkpointing:
            model_engine.module.gradient_checkpointing_enable()
        global_step = 0
        best_metric = None
        best_model_checkpoint = None
        if args.do_eval:
            # Saving decisions depend on the latest eval result, hence the alignment constraint.
            assert args.eval_steps <= args.save_steps and args.save_steps % args.eval_steps == 0, \
                f"save steps should be greater than eval steps and be a multiple of eval steps"
            # NOTE(review): no `eval` function is defined in this module, so this
            # resolves to the Python builtin eval() and would raise a TypeError at
            # runtime — confirm which evaluation routine was intended here.
            eval_results = eval(global_step)
            # NOTE(review): "Gloal" is a typo for "Global" in these log messages
            # (left untouched here because log strings are runtime behavior).
            print_rank_0(f"Epoch-0, Gloal step-{global_step}, Evaluation result: {eval_results}")
            if args.metric_for_best_model is not None:
                assert args.metric_for_best_model in eval_results, \
                    f"{args.metric_for_best_model} is not a valid metric, " \
                    f"please choose from the following metrics: {eval_results.keys()}"
        for epoch in range(args.num_epochs):
            print_rank_0(f"Beginning of Epoch {epoch+1}/{args.num_epochs}")
            for step, batch in enumerate(train_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}
                logger.debug(f"batch keys: {batch.keys()}")
                output = model_engine(**batch)
                # DeepSpeed handles loss scaling / grad accumulation internally.
                model_engine.backward(output.loss)
                model_engine.step()
                global_step += 1
                if global_step % args.logging_steps == 0:
                    print_rank_0(f"Epoch-{epoch+1}, Gloal step-{global_step}, loss: {output.loss}")
                if args.do_eval and global_step % args.eval_steps == 0:
                    # NOTE(review): same builtin-eval issue as above.
                    eval_results = eval(global_step)
                    print_rank_0(f"Epoch-{epoch+1}, Gloal step-{global_step}, Evaluation result: {eval_results}")
                if global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                    # Track the best checkpoint so rotation never deletes it.
                    if args.do_eval and args.metric_for_best_model is not None:
                        if (
                            best_metric is None
                            or best_model_checkpoint is None
                            or eval_results[args.metric_for_best_model] > best_metric
                        ):
                            best_metric = eval_results[args.metric_for_best_model]
                            best_model_checkpoint = output_dir
                    rotate_checkpoints(args.save_total_limit, use_mtime=True, output_dir=args.output_dir,
                                       best_model_checkpoint=best_model_checkpoint)
                    # save_zero_three_model(model_engine, args.local_rank,
                    #                       save_dir=output_dir,
                    #                       zero_stage=ds_config['zero_optimization']['stage'])
                    # model_engine.save_16bit_model(output_dir)
                    model_engine.save_checkpoint(args.output_dir, f"checkpoint-{global_step}")
                    print_rank_0(f"Finished saving checkpoint @Step-{global_step}")
        print_rank_0(f"Finished training! epochs: {epoch+1}, steps: {global_step}")
        # Final save after the last epoch.
        output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
        # save_zero_three_model(model_engine, args.local_rank,
        #                       save_dir=output_dir,
        #                       zero_stage=ds_config['zero_optimization']['stage'])
        # model_engine.save_16bit_model(output_dir)
        model_engine.save_checkpoint(args.output_dir, f"checkpoint-{global_step}")
        print_rank_0(f"Finished saving checkpoint @Step-{global_step}")
    elif args.do_eval:
        pass

    if args.do_pred:
        model.eval()
        device = f"cuda:{args.local_rank}" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        tokenizer.padding_side = "left"
        with open(os.path.join(args.output_dir, args.output_filename), "w", encoding="utf-8") as w:
            # TSV header: prompt followed by one column per returned sequence
            w.write("\t".join(["prompt"]+[f"model_answer_{i}" for i in range(args.num_return_sequences)])+"\n")
            for test_data in tqdm(test_dataset.post_list, desc="Prediction"):
                prompt = test_data['prompt']
                prefix = test_data['prefix']
                # label = dev_data['label']
                if "chatglm" in args.model_name_or_path.lower():
                    encoded_prompt = tokenizer(prompt)
                    prompt_length = len(encoded_prompt['input_ids'])
                    inputs = tokenizer(prompt, max_length=min(prompt_length, args.max_length),
                                       truncation="only_first", return_tensors="pt")
                    # max_gen_length = args.max_length - encoded_dict['input_ids'].shape[1]
                    # inputs = tokenizer.build_inputs_for_generation(encoded_dict,
                    #                                                max_gen_length=max_gen_length, padding=True)
                    inputs = inputs.to(device)
                    outputs = model.generate(**inputs,
                                             max_new_tokens=args.max_length_generation,
                                             eos_token_id=tokenizer.eop_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                elif "glm" in args.model_name_or_path.lower():
                    # GLM: append answer prefix plus [MASK] and fill via
                    # build_inputs_for_generation.
                    encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)
                    prompt_length = len(encoded_prompt['input_ids'])
                    encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token,
                                             max_length=min(prompt_length, args.max_length),
                                             truncation="only_first", return_tensors="pt",
                                             return_token_type_ids=False)
                    max_gen_length = args.max_length - encoded_dict['input_ids'].shape[1]
                    inputs = tokenizer.build_inputs_for_generation(encoded_dict,
                                                                   max_gen_length=max_gen_length, padding=True)
                    inputs = inputs.to(device)
                    outputs = model.generate(**inputs,
                                             max_new_tokens=min(args.max_length_generation, max_gen_length),
                                             eos_token_id=tokenizer.eop_token_id,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                else:
                    inputs = tokenizer(prompt, tokenizer.sep_token + prefix, max_length=args.max_length,
                                       truncation="longest_first", add_special_tokens=False,
                                       return_tensors="pt", return_token_type_ids=False)
                    # inputs = tokenizer(prompt, add_special_tokens=False, return_token_type_ids=False, return_tensors="pt")
                    inputs = inputs.to(device)
                    outputs = model.generate(**inputs,
                                             max_new_tokens=args.max_length_generation,
                                             pad_token_id=tokenizer.pad_token_id,
                                             do_sample=args.do_sample,
                                             num_return_sequences=args.num_return_sequences,
                                             top_k=args.top_k,
                                             top_p=args.top_p,
                                             temperature=args.temperature)
                results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                # NOTE(review): split(prefix, ...) raises ValueError when prefix is ""
                # and IndexError when the prefix does not occur in the decoded text —
                # confirm prefix is always non-empty and present.
                w.write("\t".join([prompt]+[result.split(prefix, maxsplit=1)[1] for result in results])+"\n")


if __name__ == "__main__":
    main()
src/utils/__init__.py ================================================ from src.utils.config import * from src.utils.logger import logger from src.utils.nlp_utils import * from src.utils.file_utils import * from src.utils.loading import * from src.utils.method_configs import * from src.utils.modeling_utils import * ================================================ FILE: src/utils/config.py ================================================ import os import yaml from copy import deepcopy from dataclasses import dataclass, field from typing import Any, Dict, Optional, Set from src.utils.method_configs import MethodConfig, get_method, PPOConfig, SFTConfig, ILQLConfig ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) RESOURCE_PATH = os.path.join(ROOT_PATH, "resources") # -------- RLHF Config------- # def merge(base: Dict, update: Dict, updated: Set) -> Dict: "Recursively updates a nested dictionary with new values" for k, v in base.items(): if k in update and isinstance(v, dict): base[k] = merge(v, update[k], updated) updated.add(k) elif k in update: base[k] = update[k] updated.add(k) return base def _merge_dicts(base: Dict, update: Dict) -> Dict: "Merge two dictionaries recursively, returning a new dictionary." base = deepcopy(base) for k, v in update.items(): if isinstance(v, dict): base[k] = _merge_dicts(base.get(k, {}), v) else: base[k] = v return base @dataclass class ModelConfig: """ Config for a model. :param model_path: Path or name of the model (local or on huggingface hub) :type model_path: str :param model_arch_type: Type of model architecture. Either "causal" or "seq2seq" :type model_arch_type: str :param num_layers_unfrozen: Number of layers to unfreeze for fine-tuning. -1 means all layers are unfrozen. :type num_layers_unfrozen: int :param delta_kwargs: Keyword arguments for instantiating OpenDelta models for delta-tuning. Follow the `OpenDelta.AutoDeltaConfig` specification, e.g. 
for LoRA style tuning, set the `delta_type` to `lora` and include the model specific hyper-parameters (e.g. `lora_r`) {"delta_type": "lora", "modified_modules": "all", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.0} or in YAML format: delta_kwargs: delta_type: lora modified_modules: "all" lora_r: 8 lora_alpha: 16 lora_dropout: 0.0 See: https://opendelta.readthedocs.io/en/latest/modules/auto_delta.html#opendelta.auto_delta.AutoDeltaConfig :type delta_kwargs: Optional[Dict[str, Any]] """ model_path: str model_arch_type: str = "causal" num_layers_unfrozen: int = -1 delta_kwargs: Optional[Dict[str, Any]] = None @classmethod def from_dict(cls, config: Dict[str, Any]): return cls(**config) @dataclass class TokenizerConfig: """ Config for a model. :param tokenizer_path: Path or name of the tokenizer (local or on huggingface hub) :type tokenizer_path: str :param padding_side: Padding side :type padding_path: str :param truncation_side: Truncation side :type truncation_side: str """ tokenizer_path: str padding_side: str = "left" truncation_side: str = "right" @classmethod def from_dict(cls, config: Dict[str, Any]): return cls(**config) @dataclass class OptimizerConfig: """ Config for an optimizer. :param name: Name of the optimizer :type name: str :param kwargs: Keyword arguments for the optimizer (e.g. lr, betas, eps, weight_decay) :type kwargs: Dict[str, Any] """ name: str kwargs: Dict[str, Any] = field(default_factory=dict) @classmethod def from_dict(cls, config: Dict[str, Any]): return cls(**config) @dataclass class SchedulerConfig: """ Config for a learning rate scheduler. :param name: Name of the scheduler :type name: str :param kwargs: Keyword arguments for the scheduler instance (e.g. warmup_steps, T_max) :type kwargs: Dict[str, Any] """ name: str kwargs: Dict[str, Any] = field(default_factory=dict) @classmethod def from_dict(cls, config: Dict[str, Any]): return cls(**config) @dataclass class TrainConfig: """ Config for train job on model. 
@dataclass
class TrainConfig:
    """
    Config for train job on model.

    :param total_steps: Total number of training steps
    :type total_steps: int

    :param seq_length: Number of tokens to use as context (max length for tokenizer)
    :type seq_length: int

    :param epochs: Total number of passes through data
    :type epochs: int

    :param batch_size: Batch size for training
    :type batch_size: int

    :param tracker: Tracker to use for logging. Default: "wandb"
    :type tracker: str

    :param checkpoint_interval: Save model every checkpoint_interval steps.
        Each checkpoint is stored in a sub-directory of the `TrainConfig.checkpoint_dir`
        directory in the format `checkpoint_dir/checkpoint_{step}`.
    :type checkpoint_interval: int

    :param eval_interval: Evaluate model every eval_interval steps
    :type eval_interval: int

    :param pipeline: Pipeline to use for training. One of the registered pipelines
        present in trlx.pipeline
    :type pipeline: str

    :param trainer: Trainer to use for training. One of the registered trainers
        present in trlx.trainer
    :type trainer: str

    :param trainer_kwargs: Extra keyword arguments for the trainer
    :type trainer_kwargs: Dict[str, Any]

    :param project_name: Project name for wandb
    :type project_name: str

    :param entity_name: Entity name for wandb
    :type entity_name: str

    :param group_name: Group name for wandb (used for grouping runs)
    :type group_name: str

    :param checkpoint_dir: Directory to save checkpoints
    :type checkpoint_dir: str

    :param rollout_logging_dir: Directory to store generated rollouts for use
        in Algorithm Distillation. Only used by AcceleratePPOTrainer.
    :type rollout_logging_dir: Optional[str]

    :param save_best: Save best model based on mean reward
    :type save_best: bool

    :param seed: Random seed
    :type seed: int
    """

    total_steps: int
    seq_length: int
    epochs: int
    batch_size: int

    checkpoint_interval: int
    eval_interval: int

    pipeline: str  # One of the pipelines in framework.pipeline
    trainer: str  # One of the trainers
    trainer_kwargs: Dict[str, Any] = field(default_factory=dict)  # Extra keyword arguments for the trainer

    project_name: str = "trlx"
    entity_name: Optional[str] = None
    group_name: Optional[str] = None

    checkpoint_dir: str = "ckpts"
    rollout_logging_dir: Optional[str] = None
    save_best: bool = True

    tracker: Optional[str] = "wandb"
    logging_dir: Optional[str] = None

    # LoRA hyper-parameters (0 rank disables LoRA)
    lora_rank: Optional[int] = 0
    lora_alpha: Optional[int] = 1
    lora_train_bias: Optional[str] = "none"

    seed: int = 1000

    @classmethod
    def from_dict(cls, config: Dict[str, Any]):
        """Build a TrainConfig from a plain dict (e.g. parsed YAML)."""
        return cls(**config)
>>> config = trlx.data.default_configs.default_ilql_config() >>> config = config.evolve(method=dict(gamma=0.99, gen_kwargs=dict(max_new_tokens=100)) >>> config.method.gamma 0.99 """ return TRLConfig.from_dict(_merge_dicts(self.to_dict(), kwargs)) @classmethod def from_dict(cls, config: Dict): """ Convert dictionary to TRLConfig. """ return cls( method=get_method(config["method"]["name"]).from_dict(config["method"]), model=ModelConfig.from_dict(config["model"]), tokenizer=TokenizerConfig.from_dict(config["tokenizer"]), optimizer=OptimizerConfig.from_dict(config["optimizer"]), scheduler=SchedulerConfig.from_dict(config["scheduler"]), train=TrainConfig.from_dict(config["train"]), ) @classmethod def update(cls, baseconfig: Dict, config: Dict): if not isinstance(baseconfig, Dict): baseconfig = baseconfig.to_dict() updates = set() merged = merge(baseconfig, config, updates) for param in config: if param not in updates: raise ValueError(f"parameter {param} is not present in the config (typo or a wrong config)") return cls.from_dict(merged) def __str__(self): """Returns a human-readable string representation of the config.""" import json return json.dumps(self.to_dict(), indent=4) def default_ppo_config(): return TRLConfig( train=TrainConfig( seq_length=1024, epochs=100, total_steps=10000, batch_size=32, checkpoint_interval=10000, eval_interval=100, pipeline="PromptPipeline", trainer="AcceleratePPOTrainer", ), model=ModelConfig(model_path="lvwerra/gpt2-imdb", num_layers_unfrozen=2), tokenizer=TokenizerConfig(tokenizer_path="gpt2", truncation_side="right"), optimizer=OptimizerConfig( name="adamw", kwargs=dict(lr=1.0e-4, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6) ), scheduler=SchedulerConfig(name="cosine_annealing", kwargs=dict(T_max=10000, eta_min=1.0e-4)), method=PPOConfig( name="PPOConfig", num_rollouts=128, chunk_size=128, ppo_epochs=4, init_kl_coef=0.05, target=6, horizon=10000, gamma=1, lam=0.95, cliprange=0.2, cliprange_value=0.2, vf_coef=1, 
scale_reward="ignored", ref_mean=None, ref_std=None, cliprange_reward=10, gen_kwargs=dict( max_new_tokens=40, top_k=0, top_p=1.0, do_sample=True, ), ), ) def default_ilql_config(): return TRLConfig( train=TrainConfig( seq_length=64, batch_size=32, epochs=100, total_steps=1000, checkpoint_interval=1000, eval_interval=100, pipeline="PromptPipeline", trainer="AccelerateILQLTrainer", ), model=ModelConfig(model_path="gpt2", num_layers_unfrozen=-1), tokenizer=TokenizerConfig(tokenizer_path="gpt2", truncation_side="right"), optimizer=OptimizerConfig( name="adamw", kwargs=dict(lr=5.0e-5, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6) ), scheduler=SchedulerConfig( name="cosine_annealing", kwargs=dict(T_max=1000, eta_min=5.0e-5) # train.total_steps ), method=ILQLConfig( name="ilqlconfig", tau=0.7, gamma=0.99, cql_scale=0.1, awac_scale=1, alpha=0.001, beta=0, steps_for_target_q_sync=5, two_qs=True, gen_kwargs=dict(max_new_tokens=56, top_k=20, beta=4, temperature=1.0), ), ) def default_sft_config(): return TRLConfig( train=TrainConfig( seq_length=1024, epochs=100, total_steps=1000, batch_size=8, checkpoint_interval=10000, eval_interval=100, pipeline="PromptPipeline", trainer="AccelerateSFTTrainer", ), model=ModelConfig(model_path="gpt2", num_layers_unfrozen=-1), tokenizer=TokenizerConfig(tokenizer_path="gpt2", truncation_side="right"), optimizer=OptimizerConfig( name="adamw", kwargs=dict(lr=1.0e-4, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6) ), scheduler=SchedulerConfig( name="cosine_annealing", kwargs=dict(T_max=10000, eta_min=1.0e-4) # train.total_steps ), method=SFTConfig( name="sftconfig", gen_kwargs=dict(max_new_tokens=40, top_k=0, top_p=1.0, do_sample=True), ), ) def get_train_ds_config(global_batch_size=32, micro_batch_size=4, gradient_accumulation_steps=8, offload=False, stage=2, enable_hybrid_engine=False, inference_tp_size=1, release_inference_cache=False, pin_parameters=True, tp_gather_partition_size=8, max_out_tokens=512): device = "cpu" if offload 
else "none" zero_opt_dict = { "stage": stage, "offload_param": { "device": device }, "offload_optimizer": { "device": device }, "stage3_param_persistence_threshold": 1e4, "stage3_max_live_parameters": 3e7, "stage3_prefetch_bucket_size": 3e7, "memory_efficient_linear": False } return { "train_batch_size": global_batch_size, "train_micro_batch_size_per_gpu": micro_batch_size, "gradient_accumulation_steps": gradient_accumulation_steps, "steps_per_print": 10, "zero_optimization": zero_opt_dict, "fp16": { "enabled": True, "loss_scale_window": 100 }, "gradient_clipping": 1.0, "prescale_gradients": False, "wall_clock_breakdown": False, "hybrid_engine": { "enabled": enable_hybrid_engine, "max_out_tokens": max_out_tokens, "inference_tp_size": inference_tp_size, "release_inference_cache": release_inference_cache, "pin_parameters": pin_parameters, "tp_gather_partition_size": tp_gather_partition_size, } } def get_eval_ds_config(global_batch_size=32, micro_batch_size=4, gradient_accumulation_steps=8, offload=False, stage=0): device = "cpu" if offload else "none" zero_opt_dict = { "stage": stage, "stage3_param_persistence_threshold": 1e4, "offload_param": { "device": device }, "memory_efficient_linear": False } return { "train_batch_size": global_batch_size, "train_micro_batch_size_per_gpu": micro_batch_size, "gradient_accumulation_steps": gradient_accumulation_steps, "steps_per_print": 10, "zero_optimization": zero_opt_dict, "fp16": { "enabled": True }, "gradient_clipping": 1.0, "prescale_gradients": False, "wall_clock_breakdown": False } ================================================ FILE: src/utils/file_utils.py ================================================ import math import os import random import numpy as np import torch from numbers import Number from pynvml import * from src.utils.logger import logger def set_seed(seed_val=42): random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) def print_rank_0(*message): 
""" Print only once from the main rank """ if os.environ.get("RANK", "0") == "0": logger.info(*message) def significant(x: Number, ndigits=2) -> Number: """ Cut the number up to its `ndigits` after the most significant """ if isinstance(x, torch.Tensor): x = x.item() if not isinstance(x, Number) or math.isnan(x) or x == 0: return x return round(x, ndigits - int(math.floor(math.log10(abs(x))))) # # def set_seed(seed: int): # """ # Sets seeds across package dependencies for reproducibility. # """ # seed += int(os.environ.get("RANK", 0)) # random.seed(seed) # np.random.seed(seed) # torch.manual_seed(seed) # torch.cuda.manual_seed(seed) def print_gpu_utilization(prefix: str = "", index: int = 0, only_rank_0: bool = True): nvmlInit() handle = nvmlDeviceGetHandleByIndex(index) info = nvmlDeviceGetMemoryInfo(handle) memory_used = info.used / 1024**3 if only_rank_0: if index == 0: logger.info(f"[{prefix}] GPU-{index} memory occupied: {memory_used:.2f} GB") else: logger.info(f"[{prefix}] GPU-{index} memory occupied: {memory_used:.2f} GB") def print_gpu_utilization_torch(prefix: str = "", index: int = 0, only_rank_0: bool = True): memory_allocated = torch.cuda.memory_allocated() / 1024 ** 3 max_memory_allocated = torch.cuda.max_memory_allocated() / 1024 ** 3 memory_reserved = torch.cuda.memory_reserved() / 1024 ** 3 # max_memory_reserved = torch.cuda.max_memory_reserved() / 1024 ** 3 if only_rank_0: if index == 0: logger.info(f"[{prefix}] GPU-{index}: memory allocated: {memory_allocated:.2f} GB, " f"max memory allocated: {max_memory_allocated:.2f} GB, " f"memory reserved: {memory_reserved:.2f} GB, " # f"max memory reserved: {max_memory_allocated:.2f} GB" ) else: logger.info(f"[{prefix}] GPU-{index}: memory allocated: {memory_allocated:.2f} GB, " f"max memory allocated: {max_memory_allocated:.2f} GB, " f"memory reserved: {memory_reserved:.2f} GB, " # f"max memory reserved: {max_memory_reserved:.2f} GB" ) def print_trainable_parameters(model): """ Prints the number of 
    trainable parameters in the model. """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    logger.info(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


================================================
FILE: src/utils/loading.py
================================================
import types
import torch
import glob

from typing import Callable, Dict, Tuple

# Register load pipelines via module import
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
)
from transformers.models.llama import LlamaModel, LlamaForCausalLM
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from accelerate.utils import get_balanced_memory
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

from src.utils import print_trainable_parameters
from src.utils.modeling_utils import _prepare_decoder_attention_mask
from src.data.pipeline import _DATAPIPELINE
from src.models.trainer import _TRAINERS, register_trainer

# from trlx.pipeline.offline_pipeline import PromptPipeline
# from trlx.trainer.accelerate_ilql_trainer import AccelerateILQLTrainer
# from trlx.trainer.accelerate_ppo_trainer import AcceleratePPOTrainer
# from trlx.trainer.accelerate_sft_trainer import AccelerateSFTTrainer

try:
    from src.models.trainer import NeMoILQLTrainer
except ImportError:  # NeMo is not installed
    # Register a stub under the same trainer name so name lookups still
    # succeed; the stub raises a helpful ImportError only when invoked.
    # ("unavailble" is an existing typo in a local name; kept as-is.)
    def _trainer_unavailble(name):
        def log_error(*args, **kwargs):
            raise ImportError(f"Unable to import NeMo so {name} is unavailable")

        return register_trainer(name)(log_error)

    _trainer_unavailble("NeMoILQLTrainer")


def prepare_decoder_attention_mask(self, *args, **kwargs):
    # Bound onto LlamaModel via types.MethodType (see load_tokenizer_and_model)
    # to override the default causal-mask construction; delegates to the
    # shared helper in src.utils.modeling_utils.
    return _prepare_decoder_attention_mask(*args, **kwargs)


def
chatglm_auto_configure_device_map(num_gpus: int, model_name: str, local_rank: int = 0) -> Dict[str, int]:
    # transformer.word_embeddings occupies 1 slot
    # transformer.final_layernorm and lm_head occupy 1 slot
    # transformer.layers occupies 28 slots
    # In total 30 slots are distributed over num_gpus cards
    num_hidden_layers = 28
    layers_per_gpu = (num_hidden_layers+2) // num_gpus
    layer_prefix = 'transformer'

    # bugfix: on Linux, torch.embedding may receive weight and input on
    # different devices, causing a RuntimeError.
    # On Windows, model.device is set to transformer.word_embeddings.device;
    # on Linux, model.device is set to lm_head.device.
    # When calling chat or stream_chat, input_ids is placed on model.device;
    # if transformer.word_embeddings.device differs from model.device this
    # raises a RuntimeError. Therefore transformer.word_embeddings,
    # transformer.final_layernorm and lm_head are all placed on the first card.
    encode = ""
    if 'chatglm2' in model_name:
        device_map = {
            f"{layer_prefix}.embedding.word_embeddings": local_rank,
            f"{layer_prefix}.rotary_pos_emb": local_rank,
            f"{layer_prefix}.output_layer": local_rank,
            f"{layer_prefix}.encoder.final_layernorm": local_rank,
            f"base_model.model.output_layer": local_rank,
        }
        encode = ".encoder"
    else:
        device_map = {
            f'{layer_prefix}.word_embeddings': local_rank,
            f'{layer_prefix}.final_layernorm': local_rank,
            'lm_head': local_rank,
            f'base_model.model.lm_head': local_rank,
        }
    # Two slots already used on the first card by the embeddings/norm/head above.
    used = 2
    gpu_target = 0
    # TODO: Assuming CUDA device index is consecutive, e.g. cuda:0, cuda:1, cuda:2
    for i in range(num_hidden_layers):
        # `(gpu_target % 2)` lets odd-indexed cards take one extra layer;
        # presumably to balance the 2 reserved slots on card 0 — TODO confirm.
        if used >= layers_per_gpu + (gpu_target % 2):
            gpu_target += 1
            gpu_target %= num_gpus
            used = 0
        # NOTE(review): `gpu_target + local_rank` offsets every assignment by
        # local_rank, which can exceed num_gpus-1 when local_rank > 0 — verify.
        device_map[f'{layer_prefix}{encode}.layers.{i}'] = gpu_target + local_rank
        used += 1

    return device_map


def llama_and_baichuan_auto_configure_device_map(num_gpus: int, model_name: str, local_rank: int = 0) -> Dict[str, int]:
    layer_prefix = 'model'
    # model.embed_tokens occupies 1 slot
    # model.norm and lm_head occupy 1 slot
    # model.layers occupies num_hidden_layers slots
    # In total num_hidden_layers+2 slots are distributed over num_gpus cards
    if "7b" in model_name.lower():
        num_hidden_layers = 32
    elif "13b" in model_name.lower():
        num_hidden_layers = 40
    else:
        raise ValueError(f"Only supports baichuan-7B, baichuan-13B, llama-7B and llama-13B, but {model_name} is provided")
    layers_per_gpu = (num_hidden_layers+2) // num_gpus
    device_map = {
        f'{layer_prefix}.embed_tokens': local_rank,
        f'{layer_prefix}.norm': local_rank,
        'lm_head': local_rank,
        f'base_model.model.lm_head': local_rank,
    }
    used = 2
    gpu_target = 0
    # TODO: Assuming CUDA device index is consecutive, e.g.
cuda:0, cuda:1, cuda:2
    for i in range(num_hidden_layers):
        if used >= layers_per_gpu + (gpu_target % 2):
            gpu_target += 1
            gpu_target %= num_gpus
            used = 0
        device_map[f'{layer_prefix}.layers.{i}'] = gpu_target + local_rank
        used += 1

    return device_map


def load_params_8bit_or_4bit(args, model: PreTrainedModel) -> Dict:
    """Build `from_pretrained` kwargs (quantization config + device map) for
    8-bit / 4-bit loading of `model`."""
    # init bnb config for quantization
    # bf16 compute needs compute capability >= 8 (Ampere or newer)
    bf16 = torch.cuda.get_device_capability()[0] >= 8
    if bf16:
        bnb_4bit_compute_dtype = torch.bfloat16
    else:
        bnb_4bit_compute_dtype = torch.float16
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=args.bits == 8,
        load_in_4bit=args.bits == 4,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
    )
    params = {
        "low_cpu_mem_usage": True,
        'quantization_config': bnb_config
    }
    # infer device map: balanced across cards when multi-card, else pin to local rank
    if args.multi_card:
        max_memory = get_balanced_memory(model, dtype=torch.int8, low_zero=False,
                                         no_split_module_classes=model._no_split_modules)
        params['device_map'] = infer_auto_device_map(
            model, dtype=torch.int8, no_split_module_classes=model._no_split_modules, max_memory=max_memory
        )
    else:
        params['device_map'] = {"": args.local_rank}

    return params


def load_tokenizer_and_model(args, with_trainer: bool = True) -> Tuple[PreTrainedTokenizer, PreTrainedModel, int]:
    """Load tokenizer and model per `args`; also resolve the EOS token id,
    which differs per model family (and per checkpoint state)."""
    # load tokenizer
    tokenizer_path = args.tokenizer_path if hasattr(args, "tokenizer_path") else args.model_name_or_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
    # set eos token
    if "chatglm3" in args.model_name_or_path.lower():
        eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), tokenizer.get_command("<|observation|>")]
    elif "chatglm2" in args.model_name_or_path.lower():
        # NOTE(review): get_command("") with an empty token name looks
        # suspicious — confirm the intended command token.
        eos_token_id = tokenizer.get_command("eop") if args.checkpoint is not None else tokenizer.get_command("")
    elif "chatglm1_1" in args.model_name_or_path.lower():
        eos_token_id = tokenizer.eos_token_id
    elif "chatglm" in args.model_name_or_path.lower():
        eos_token_id = tokenizer.eop_token_id
    elif "baichuan" in
args.model_name_or_path.lower():
        eos_token_id = tokenizer.bos_token_id if args.checkpoint is not None else tokenizer.eos_token_id
        tokenizer.pad_token_id = tokenizer.eos_token_id
    else:
        eos_token_id = tokenizer.eos_token_id
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

    # load model: chatglm ships as seq2seq, everything else as causal LM
    if "chatglm" in args.model_name_or_path.lower():
        model_class = AutoModelForSeq2SeqLM
    else:
        model_class = AutoModelForCausalLM
    if torch.cuda.is_available():
        # bf16 only on compute capability >= 8; otherwise fp16
        dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
    else:
        dtype = torch.float32
    params = {
        "trust_remote_code": True,
        "torch_dtype": dtype,
        "load_in_8bit": hasattr(args, "bits") and args.bits == 8,
        "load_in_4bit": hasattr(args, "bits") and args.bits == 4,
        # "quantization_config": bnb_config,
    }
    if with_trainer:
        params["device_map"] = args.device_map
        params["low_cpu_mem_usage"] = args.low_cpu_mem_usage
    model = model_class.from_pretrained(args.model_name_or_path, **params)
    # (kept) previous manual loading paths: cpu / 8-4bit / multi-card dispatch / single card
    # # cpu
    # if not torch.cuda.is_available():
    #     model = model_class.from_pretrained(args.model_name_or_path,
    #                                         trust_remote_code=True)
    # # 8bit or 4bit
    # elif hasattr(args, "bits") and args.bits in [4, 8]:
    #     config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    #     model = model_class.from_config(config, trust_remote_code=True)
    #     params = load_params_8bit_or_4bit(args, model)
    #     model = model_class.from_pretrained(args.model_name_or_path,
    #                                         trust_remote_code=True,
    #                                         **params)
    #     if args.do_train:
    #         if args.gradient_checkpointing:
    #             model.gradient_checkpointing_enable()
    #         model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing)
    # # multi gpu card
    # elif hasattr(args, "multi_card") and args.multi_card:
    #     with init_empty_weights():
    #         config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    #         model = model_class.from_config(config, trust_remote_code=True).half()
    #     model.tie_weights()
    #     if "llama" in args.model_name_or_path.lower() or \
    #             "baichuan" in args.model_name_or_path.lower() or \
    #             "vicuna" in args.model_name_or_path.lower():
    #         device_map = llama_and_baichuan_auto_configure_device_map(
    #             torch.cuda.device_count(),
    #             args.model_name_or_path.lower(),
    #             args.local_rank
    #         )
    #     elif "chatglm" in args.model_name_or_path.lower():
    #         device_map = chatglm_auto_configure_device_map(
    #             torch.cuda.device_count(),
    #             args.model_name_or_path.lower(),
    #             args.local_rank
    #         )
    #     else:
    #         # max_memory = get_balanced_memory(model, dtype=torch.float16, low_zero=False,
    #         #                                  no_split_module_classes=model._no_split_modules)
    #         # device_map = infer_auto_device_map(model, dtype=torch.float16, max_memory=max_memory,
    #         #                                    no_split_module_classes=model._no_split_modules)
    #         device_map = "auto"
    #
    #     model = load_checkpoint_and_dispatch(model,
    #                                          checkpoint=args.model_name_or_path,
    #                                          device_map=device_map,
    #                                          no_split_module_classes=model._no_split_modules,
    #                                          dtype=torch.float16)
    # # single gpu card
    # else:
    #     model = model_class.from_pretrained(args.model_name_or_path,
    #                                         trust_remote_code=True,
    #                                         torch_dtype=torch.float16,
    #                                         device_map={"": args.local_rank})

    # post-loading operations
    # When samples are concatenated, llama needs a custom decoder-mask builder;
    # it is monkey-patched onto the inner LlamaModel here.
    if hasattr(args, "concat_samples") and args.concat_samples and isinstance(model, LlamaForCausalLM):
        prepare_decoder_attention_mask_method = types.MethodType(prepare_decoder_attention_mask, model.model)
        setattr(model.model, "_prepare_decoder_attention_mask", prepare_decoder_attention_mask_method)
        # funcType = type(LlamaModel._prepare_decoder_attention_mask)
        # model.model._prepare_decoder_attention_mask = funcType(prepare_decoder_attention_mask, model.model, LlamaModel)
    if "pangu" in args.model_name_or_path.lower():
        model.resize_token_embeddings(tokenizer.vocab_size)
    # Quantized training needs gradient checkpointing enabled before k-bit prep.
    if hasattr(args, "bits") and args.bits in [4, 8] and args.do_train:
        if args.gradient_checkpointing:
            model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing)

    # init peft
model (if necessary)
    if hasattr(args, "lora_rank") and args.lora_rank > 0:
        model = to_peft_model(args, model)

    return tokenizer, model, eos_token_id


def to_peft_model(args, model: PreTrainedModel) -> PreTrainedModel:
    """Wrap `model` with a LoRA adapter; target modules and task type depend
    on the model family inferred from `args.model_name_or_path`."""
    if "llama" in args.model_name_or_path.lower() or \
            "vicuna" in args.model_name_or_path.lower() or \
            "billa" in args.model_name_or_path.lower() or \
            "atomgpt" in args.model_name_or_path.lower() or \
            "pangu" in args.model_name_or_path.lower():
        target_modules = ["q_proj", "k_proj", "v_proj"]
        task_type = "CAUSAL_LM"
    elif "baichuan" in args.model_name_or_path.lower():
        target_modules = ["W_pack"]
        task_type = "CAUSAL_LM"
    elif "bloom" in args.model_name_or_path.lower() or "tigerbot" in args.model_name_or_path.lower():
        target_modules = ["query_key_value"]
        task_type = "CAUSAL_LM"
    elif "glm" in args.model_name_or_path.lower():
        target_modules = ["query_key_value"]
        task_type = "SEQ_2_SEQ_LM"
    else:
        raise ValueError(f"Unsupported model name: {args.model_name_or_path}")
    config = LoraConfig(
        r=args.lora_rank,
        lora_alpha=args.lora_alpha,
        target_modules=target_modules,
        lora_dropout=0.05,
        bias=args.lora_train_bias,
        task_type=task_type
    )
    # Required so inputs carry gradients through the frozen base model.
    model.enable_input_require_grads()
    model = get_peft_model(model, config)
    print_trainable_parameters(model)

    return model


def load_checkpoint(args, model: PreTrainedModel, strict: bool = True) -> None:
    """Load (possibly sharded) state dicts into `model` in place.

    The literal substring "star" in `args.checkpoint` is a placeholder for a
    glob '*' (avoids shell expansion in launcher scripts).
    """
    checkpoints = glob.glob(args.checkpoint.replace("star", "*"))
    st = dict()
    for checkpoint in checkpoints:
        st.update(torch.load(checkpoint, map_location="cpu"))
    model.load_state_dict(st, strict=strict)
    del st


def get_trainer(name: str) -> Callable:
    """
    Return constructor for specified RL model trainer
    """
    name = name.lower()
    if name in _TRAINERS:
        return _TRAINERS[name]
    else:
        raise Exception("Error: Trying to access a trainer that has not been registered")


def get_pipeline(name: str) -> Callable:
    """
    Return constructor for specified pipeline
    """
    name = name.lower()
    if name in _DATAPIPELINE:
        return _DATAPIPELINE[name]
    else:
        raise Exception("Error: Trying to access a pipeline that has not been registered")


================================================
FILE: src/utils/logger.py
================================================
import os
import logging.config

from src.utils.config import RESOURCE_PATH

# Configure logging from the shared resource config; keep pre-existing loggers.
logging.config.fileConfig(os.path.join(RESOURCE_PATH, "config", "log.conf"), disable_existing_loggers=False)
logger = logging.getLogger(__name__)


================================================
FILE: src/utils/method_configs.py
================================================
import sys
import torch
import torch.nn.functional as F

from functools import reduce
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple
from torchtyping import TensorType

from src.utils.modeling_utils import (
    flatten_dict,
    get_tensor_stats,
    whiten,
)

# specifies a dictionary of method configs
_METHODS: Dict[str, Any] = {}


# registry
def register_method(name):
    """Decorator used register a method config

    Supports both `@register_method("name")` and bare `@register_method` usage;
    the class is stored in _METHODS and exported as a module attribute.

    Args:
        name: Name of the method
    """

    def register_class(cls, name):
        _METHODS[name] = cls
        setattr(sys.modules[__name__], name, cls)
        return cls

    if isinstance(name, str):
        name = name.lower()
        return lambda c: register_class(c, name)

    cls = name
    name = cls.__name__
    register_class(cls, name.lower())

    return cls


@dataclass
@register_method
class MethodConfig:
    """
    Config for a certain RL method.
    :param name: Name of the method
    :type name: str
    """

    name: str

    @classmethod
    def from_dict(cls, config: Dict[str, Any]):
        # Alternate constructor from a plain dict of field values.
        return cls(**config)


def get_method(name: str) -> MethodConfig:
    """
    Return constructor for specified method config
    """
    name = name.lower()
    if name in _METHODS:
        return _METHODS[name]
    else:
        raise Exception("Error: Trying to access a method that has not been registered")


# PPO Configs
@dataclass
@register_method
class PPOConfig(MethodConfig):
    """
    Config for PPO method

    :param ppo_epochs: Number of updates per batch
    :type ppo_epochs: int

    :param num_rollouts: Number of experiences to observe before learning
    :type num_rollouts: int

    :param init_kl_coef: Initial value for KL coefficient
    :type init_kl_coef: float

    :param target: Target value for KL coefficient
    :type target: float

    :param horizon: Number of steps for KL coefficient to reach target
    :type horizon: int

    :param gamma: Discount factor
    :type gamma: float

    :param lam: GAE lambda
    :type lam: float

    :param cliprange: Clipping range for PPO policy loss (1 - cliprange, 1 + cliprange)
    :type cliprange: float

    :param cliprange_value: Clipping range for predicted values
                            (observed values - cliprange_value, observed values + cliprange_value)
    :type cliprange_value: float

    :param vf_coef: Value loss scale w.r.t policy loss
    :type vf_coef: float

    :param gen_kwargs: Additioanl kwargs for the generation
    :type gen_kwargs: Dict[str, Any]

    :param gen_experience_kwargs: if this is not None, then the experience is generated using this
    :type gen_experience_kwargs: Dict[str, Any]
    """

    ppo_epochs: int
    num_rollouts: int
    chunk_size: int
    init_kl_coef: float
    target: float
    horizon: int
    gamma: float
    lam: float
    cliprange: float
    cliprange_value: float
    vf_coef: float
    scale_reward: Optional[str]
    ref_mean: Optional[float]
    ref_std: Optional[float]
    cliprange_reward: float
    gen_kwargs: dict
    gen_experience_kwargs: Optional[dict] = None

    def get_advantages_and_returns(
        self,
        values: TensorType["batch_size", "response_size"],
        rewards:
TensorType["batch_size", "response_size"],
        response_length: int,
        use_whitening: Optional[bool] = True,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Function that computes advantages and returns from rewards and values.
        Calculated as in the original PPO paper: https://arxiv.org/abs/1707.06347
        Note that rewards may include a KL divergence loss term.

        Advantages looks like this:
        Adv1 = R1 + γ * λ * R2 + γ^2 * λ^2 * R3 + ...
             - V1 + γ * (1 - λ) V2 + γ^2 * λ * (1 - λ) V3 + ...

        Returns looks like this:
        Ret1 = R1 + γ * λ * R2 + γ^2 * λ^2 * R3 + ...
             + γ * (1 - λ) V2 + γ^2 * λ * (1 - λ) V3 + ...

        Args:
            values: Tensor of shape (batch_size, response_size)
            rewards: Tensor of shape (batch_size, response_size)
            response_length: Length of the response sequence
            use_whitening: Whether to use whitening (ie. normalize advantages) or not
        """
        # Standard GAE, computed right-to-left over response tokens; the value
        # past the last token is treated as 0.
        lastgaelam = 0
        advantages_reversed = []
        for t in reversed(range(response_length)):
            nextvalues = values[:, t + 1] if t < response_length - 1 else 0.0
            delta = rewards[:, t] + self.gamma * nextvalues - values[:, t]
            lastgaelam = delta + self.gamma * self.lam * lastgaelam
            advantages_reversed.append(lastgaelam)
        advantages = torch.stack(advantages_reversed[::-1], dim=1)
        returns = advantages + values
        if use_whitening:
            advantages = whiten(advantages)
        # Advantages are detached (treated as constants in the policy loss);
        # returns keep their graph for the value loss.
        return advantages.detach(), returns

    def loss(
        self,
        logprobs: TensorType["batch_size", "response_size"],
        values: TensorType["batch_size", "response_size"],
        old_logprobs: TensorType["batch_size", "response_size"],
        old_values: TensorType["batch_size", "response_size"],
        advantages: TensorType["batch_size", "response_size"],
        returns: TensorType["batch_size", "response_size"],
        mask: TensorType["batch_size", "response_size"],
    ):
        """PPO objective function.
References: - https://stable-baselines.readthedocs.io/en/master/modules/ppo2.html """ print(f"[ppo loss] values shape: {values.shape}, old_values shape: {old_values.shape}") values_clipped = torch.clamp( values, old_values - self.cliprange_value, old_values + self.cliprange_value, ) n = mask.sum() vf_loss1 = (values - returns) ** 2 vf_loss2 = (values_clipped - returns) ** 2 vf_loss = 0.5 * torch.sum(torch.max(vf_loss1, vf_loss2) * mask) / n vf_clipfrac = torch.sum((vf_loss2 > vf_loss1).float() * mask) / n log_ratio = (logprobs - old_logprobs) * mask ratio = torch.exp(log_ratio) # Unbiased KL-div estimates (`k3`). Ref: http://joschu.net/blog/kl-approx.html with torch.no_grad(): approx_kl = torch.mean((ratio - 1) - log_ratio) pg_loss1 = -advantages * ratio pg_loss2 = -advantages * torch.clamp( ratio, 1.0 - self.cliprange, 1.0 + self.cliprange, ) pg_loss = torch.sum(torch.max(pg_loss1, pg_loss2) * mask) / n pg_clipfrac = torch.sum((pg_loss2 > pg_loss1).float() * mask) / n loss = pg_loss + self.vf_coef * vf_loss stats = dict( losses=dict( total_loss=loss.item(), policy_loss=pg_loss.item(), value_loss=vf_loss.item(), ), values=dict( get_tensor_stats(values, mask, n), values_error=torch.sum(((values - returns) * mask) ** 2) / n, clipfrac=vf_clipfrac, ), old_values=get_tensor_stats(old_values, mask, n), returns=get_tensor_stats(returns, mask, n), policy=dict(approx_kl=approx_kl.item(), clipfrac=pg_clipfrac.item()), ratio=(ratio * mask).sum() / n, padding_percentage=n / mask.numel(), ) return loss, flatten_dict(stats) @dataclass @register_method class SFTConfig(MethodConfig): """ Config for SFT training :param gen_kwargs: kwargs for generation :type gen_kwargs: Dict[str, Any] """ gen_kwargs: dict @dataclass @register_method class ILQLConfig(MethodConfig): tau: float gamma: float cql_scale: float awac_scale: float alpha: float beta: float steps_for_target_q_sync: float two_qs: bool gen_kwargs: dict def loss(self, outputs, labels): logits, (qs, target_qs, vs) = outputs 
        terminal_mask = labels.dones[:, :-1]
        n_nonterminal = max(1, terminal_mask.sum())
        # check type of labels
        # NOTE(review): `ILQLBatch` (here) and `batched_index_select` (below)
        # are not imported anywhere in this module — calling this method would
        # raise NameError. Presumably they come from the data-types / tensor
        # utility modules; confirm and add the imports.
        if isinstance(labels, ILQLBatch):
            actions = labels.input_ids[:, 1:].gather(dim=1, index=labels.actions_ixs).unsqueeze(-1)
        else:
            actions = labels.decoder_input_ids[:, 1:].unsqueeze(-1)
        nactions = actions.shape[1]
        bsize, _, dsize = logits.shape

        # Q estimates of the taken actions, plus detached target-network minima.
        Q = [q.gather(-1, actions).squeeze(-1) for q in qs]
        targetQs = [q.gather(-1, actions).squeeze(-1).detach() for q in target_qs]
        targetQ = reduce(torch.minimum, targetQs)

        # values of current states
        V = vs[:, :-1].squeeze()
        # values of next states
        Vnext = vs[:, 1:].squeeze() * labels.dones[:, 1:]
        # target to fit Q
        Q_ = labels.rewards + self.gamma * Vnext.detach()

        loss_qs = [((Qi - Q_) * terminal_mask).pow(2).sum() / n_nonterminal for Qi in Q]
        loss_q = sum(loss_qs)

        targetQ = targetQ.detach()

        # Expectile-style asymmetric value regression (tau weights over/under-estimates).
        loss_v = (
            (
                (targetQ >= V).int() * self.tau * (targetQ - V).pow(2)
                + (targetQ < V).int() * (1 - self.tau) * (targetQ - V).pow(2)
            )
            * terminal_mask
        ).sum() / n_nonterminal

        def cql_loss(q):
            # Conservative Q-learning penalty: cross-entropy of Q logits vs taken actions.
            loss = F.cross_entropy(q.reshape(-1, dsize), actions.reshape(-1), reduction="none")
            loss = loss.reshape(bsize, nactions) * terminal_mask
            loss = loss.sum() / n_nonterminal
            return loss

        loss_cql = sum(cql_loss(q) for q in qs)

        # select logits from continuations
        action_logits = batched_index_select(logits, labels.actions_ixs, dim=1)
        cross_entropy = F.cross_entropy(
            action_logits.reshape(-1, dsize),
            actions.reshape(-1),
            reduction="none",
        ).reshape(bsize, nactions)

        with torch.no_grad():
            # AWAC weighting: exponentiated advantage, no gradient through it.
            awac_weight = torch.exp(self.beta * (targetQ - V))

        loss_awac = torch.sum(cross_entropy * awac_weight * terminal_mask) / n_nonterminal
        loss = loss_q + loss_v + self.cql_scale * loss_cql + self.awac_scale * loss_awac

        stats = dict(
            losses=dict(
                loss=loss.item(),
                loss_q=loss_q.item(),
                loss_v=loss_v.item(),
                loss_cql=loss_cql.item(),
                loss_awac=loss_awac.item(),
            ),
            values=get_tensor_stats(V, terminal_mask, n_nonterminal),
            qvalues={str(ix): get_tensor_stats(Q[ix], terminal_mask, n_nonterminal) for ix in range(len(Q))},
            awac_weight=get_tensor_stats(awac_weight, terminal_mask, n_nonterminal),
        )

        return loss, flatten_dict(stats)


================================================
FILE: src/utils/modeling_utils.py
================================================
import functools
from typing import Any, Dict, List, MutableMapping, Tuple, Union, Optional

import os
import subprocess
import time
import numpy as np
import re
import shutil
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import transformers
import deepspeed

from pathlib import Path
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
from dataclasses import is_dataclass
from enum import Enum
from accelerate import Accelerator
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR
from transformers import PreTrainedTokenizer

# opendelta is optional; HAS_OPENDELTA gates any delta-tuning code paths.
try:
    from opendelta import (
        AdapterModel,
        BitFitModel,
        LoraModel,
        PrefixModel,
        SoftPromptModel,
    )

    HAS_OPENDELTA = True
except ModuleNotFoundError:
    HAS_OPENDELTA = False


# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    # Start from a fully-masked (dtype-min) square, then zero the lower triangle.
    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        # Past (cached) positions are always visible.
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    # Invert: 1 (attend) -> 0 additive bias, 0 (ignore) -> dtype-min.
    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


def _prepare_decoder_attention_mask(attention_mask, input_shape, input_embeds, past_key_values_length):
    """Build the 4-D additive decoder mask; a caller-provided 3-D mask is used
    as-is (unsqueezed), otherwise a causal mask is combined with the padding mask."""
    # create causal mask
    if attention_mask is not None:
        # attention mask is already given in 3d shape
        if len(attention_mask.shape) == 3:
            return attention_mask.unsqueeze(1).to(input_embeds.device)

    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
    combined_attention_mask = None
    if input_shape[-1] > 1:
        combined_attention_mask = _make_causal_mask(
            input_shape,
            input_embeds.dtype,
            device=input_embeds.device,
            past_key_values_length=past_key_values_length,
        )

    if attention_mask is not None:
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        expanded_attn_mask = _expand_mask(attention_mask, input_embeds.dtype, tgt_len=input_shape[-1]).to(
            input_embeds.device
        )
        combined_attention_mask = (
            expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
        )

    return combined_attention_mask


def
get_distributed_config(accelerator: Accelerator): """ Return accelerator distributed config """ dist_config = { "mixed_precision": accelerator.mixed_precision, "num_gpus": accelerator.num_processes, } if accelerator.state.deepspeed_plugin is not None: ds_plugin = accelerator.state.deepspeed_plugin dist_config.update( { "gradient_accumulation_steps": ds_plugin.gradient_accumulation_steps, "gradient_clipping": ds_plugin.gradient_clipping, "zero_stage": ds_plugin.zero_stage, "offload_optimizer_device": ds_plugin.offload_optimizer_device, "offload_param_device": ds_plugin.offload_param_device, } ) return dist_config class OptimizerName(str, Enum): """Supported optimizer names""" ADAM: str = "adam" ADAMW: str = "adamw" ADAM_8BIT_BNB: str = "adam_8bit_bnb" ADAMW_8BIT_BNB: str = "adamw_8bit_bnb" SGD: str = "sgd" def get_optimizer_class(name: OptimizerName): """ Returns the optimizer class with the given name Args: name (str): Name of the optimizer as found in `OptimizerNames` """ if name == OptimizerName.ADAM: return torch.optim.Adam if name == OptimizerName.ADAMW: return torch.optim.AdamW if name == OptimizerName.ADAM_8BIT_BNB.value: try: from bitsandbytes.optim import Adam8bit return Adam8bit except ImportError: raise ImportError( "You must install the `bitsandbytes` package to use the 8-bit Adam. " "Install with: `pip install bitsandbytes`" ) if name == OptimizerName.ADAMW_8BIT_BNB.value: try: from bitsandbytes.optim import AdamW8bit return AdamW8bit except ImportError: raise ImportError( "You must install the `bitsandbytes` package to use 8-bit AdamW. " "Install with: `pip install bitsandbytes`" ) if name == OptimizerName.SGD.value: return torch.optim.SGD supported_optimizers = [o.value for o in OptimizerName] raise ValueError(f"`{name}` is not a supported optimizer. 
" f"Supported optimizers are: {supported_optimizers}") class SchedulerName(str, Enum): """Supported scheduler names""" COSINE_ANNEALING = "cosine_annealing" LINEAR = "linear" def get_scheduler_class(name: SchedulerName): """ Returns the scheduler class with the given name """ if name == SchedulerName.COSINE_ANNEALING: return CosineAnnealingLR if name == SchedulerName.LINEAR: return LinearLR supported_schedulers = [s.value for s in SchedulerName] raise ValueError(f"`{name}` is not a supported scheduler. " f"Supported schedulers are: {supported_schedulers}") class Clock: """ Helper object for keeping track of time for computations. """ def __init__(self): self.start = time.time() self.total_time = 0 self.total_samples = 0 def tick(self, samples: int = 0) -> float: """ Returns time (s) since last call to tick(). Also records samples processed since last call. :param samples: number of samples that have been processed since last call """ end = time.time() delta = end - self.start self.start = end if samples != 0: self.total_time += delta self.total_samples += samples return delta def get_stat(self, n_samp: int = 1000, reset: bool = False): """ Returns average time (s) per n_samp samples processed :param reset: Reset counts? 
""" sec_per_samp = self.total_time / self.total_samples if reset: self.total_samples = 0 self.total_time = 0 return sec_per_samp * n_samp def tree_map(f, tree: Any) -> Any: """ Apply function f to all leaves in tree """ if is_dataclass(tree): return tree.__class__(**{k: tree_map(f, v) for k, v in tree.__dict__.items()}) elif isinstance(tree, dict): return {k: tree_map(f, v) for k, v in tree.items()} elif isinstance(tree, (list, tuple)): return tree.__class__(tree_map(f, v) for v in tree) else: return f(tree) def to_device(tree, device, non_blocking=False): """ Move all tensors in tree to device """ return tree_map(lambda x: x.to(device, non_blocking=non_blocking), tree) def filter_non_scalars(xs: Dict) -> Dict: """ Trims everything that can't be casted to float """ ys = {} for k, v in xs.items(): try: ys[k] = float(v) except TypeError: continue return ys def get_git_tag() -> Tuple[str, str]: """ Returns commit's short hash and date """ try: output = subprocess.check_output("git log --format='%h/%as' -n1".split()) branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD".split()) return branch.decode()[:-1], output.decode()[1:-2] except subprocess.CalledProcessError: return "unknown", "unknown" def make_head(n_embd: int, out: int, dtype: type = torch.float32) -> nn.Sequential: """Returns a generic sequential MLP head.""" return nn.Sequential( nn.Linear(n_embd, n_embd * 2, dtype=dtype), nn.ReLU(), nn.Linear(n_embd * 2, out, dtype=dtype), ) def freeze_bottom_causal_layers(model: nn.Module, num_layers_unfrozen: int = 0): """Freezes the bottom transformer block layers of the specified model.""" hidden_layers = hf_get_decoder_blocks(model) if num_layers_unfrozen == 0: hidden_layers_to_freeze = list(hidden_layers) elif num_layers_unfrozen > 0: hidden_layers_to_freeze = list(hidden_layers)[:-num_layers_unfrozen] else: hidden_layers_to_freeze = [] for layer in hidden_layers_to_freeze: layer.requires_grad_(False) def freeze_bottom_seq2seq_layers(model: nn.Module, 
num_layers_unfrozen: int = 0): """Freezes the bottom transformer block layers of the specified model.""" if num_layers_unfrozen == -1: return shared_embed = model.shared decoder_embed = model.decoder.embed_tokens encoder_blocks = model.encoder.block encoder_norm_layer = model.encoder.final_layer_norm decoder_norm_layer = model.decoder.final_layer_norm decoder_blocks = model.decoder.block[:-num_layers_unfrozen] blocks_to_freeze = ( list(encoder_blocks) + list(decoder_blocks) + [shared_embed] + [encoder_norm_layer] + [decoder_norm_layer] + [decoder_embed] ) for block in blocks_to_freeze: block.requires_grad_(False) def rhasattr(obj, attr): """A chain-able attribute version of hasattr. For example, to check if `obj` has the attribute `foo.bar.baz`, you can use: `rhasattr(obj, "foo.bar.baz")` Reference: https://stackoverflow.com/a/67303315 """ _nested_attrs = attr.split(".") _curr_obj = obj for _a in _nested_attrs[:-1]: if hasattr(_curr_obj, _a): _curr_obj = getattr(_curr_obj, _a) else: return False return hasattr(_curr_obj, _nested_attrs[-1]) def rgetattr(obj, attr: str, *args) -> object: """A chain-able attribute version of getattr. For example, to get the attribute `foo.bar.baz` from `obj`, you can use: `rgetattr(obj, "foo.bar.baz")` Reference: https://stackoverflow.com/a/31174427 """ def _getattr(obj, attr): return getattr(obj, attr, *args) return functools.reduce(_getattr, [obj] + attr.split(".")) def findattr(obj, attrs: Tuple[str]) -> Union[object, None]: for attr in attrs: if rhasattr(obj, attr): return rgetattr(obj, attr) raise ValueError(f"Could not find an attribute from `{attrs}` in `{obj}`") def hf_get_decoder(model: nn.Module) -> nn.Module: """Returns the causal decoder backbone of the specified HuggingFace transformers model. NOTE: Different model configurations have different causal decoder attribute names. 
    - transformer: (GPT2LMHeadModel, GPTJConfig)
    - model.decoder: (OPTConfig, BloomConfig)
    - gpt_neox: (GPTNeoXConfig)
    """
    decoder_attrs = ("transformer", "model.decoder", "gpt_neox", "decoder")
    return findattr(model, decoder_attrs)


def hf_get_decoder_final_norm(model: nn.Module) -> nn.Module:
    # (annotation fixed: this returns the norm *module*, not a float)
    """Returns the final (layer) norm of the specified decoder.
    NOTE: Different model configurations have different final norm attribute names.
        - transformer.ln_f: (GPT2LMHeadModel, GPTJForCausalLM)
        - model.decoder.final_layer_norm: (OPTForCausalLM)
        - gpt_neox.layers.final_layer_norm: (GPTNeoXForCausalLM)
    """
    norm_attrs = (
        "transformer.ln_f",
        "model.decoder.final_layer_norm",
        "decoder.final_layer_norm",
        "gpt_neox.final_layer_norm",
    )
    return findattr(model, norm_attrs)


def hf_get_decoder_blocks(model: nn.Module) -> Tuple[nn.Module]:
    """Returns the decoder hidden layers of the specified model.
    NOTE: Different model configurations have different hidden layer attribute names.
        - transformer.h: (BloomForCausalLM, GPT2LMHeadModel, GPTJForCausalLM)
        - model.decoder.layers: (OPTForCausalLM)
        - gpt_neox.layers: (GPTNeoXForCausalLM)
        - decoder.block: (T5ForConditionalGeneration)
    """
    hidden_layers_attrs = (
        "h",
        "layers",
        "decoder.layers",
        "transformer.h",
        "model.decoder.layers",
        "gpt_neox.layers",
        "decoder.block",
        "glm.transformer.layers"
    )
    return findattr(model, hidden_layers_attrs)


def hf_get_lm_head(model: nn.Module) -> nn.Module:
    """Returns the language modeling (lm) head of the specified HuggingFace
    transformers model.
    NOTE: Different model configurations have different `lm_head` attribute names.
        - lm_head: (GPT2LMHeadModel, BloomForCausalLM)
        - embed_out: (GPTNeoXForCausalLM)
    """
    return model.get_output_embeddings()


def hf_get_hidden_size(config: transformers.PretrainedConfig) -> int:
    """Returns the hidden layer dimensionality of the model architecture specified
    by the HuggingFace transformers config.
    NOTE: Different model configurations have different hidden size attribute names.
- hidden_size: (OPTConfig, BloomConfig) - n_embd: (GPT2Config, GPTJConfig) - d_model: (PegasusConfig, XLNetConfig) """ hidden_size_attrs = ("hidden_size", "n_embd", "d_model") return findattr(config, hidden_size_attrs) def hf_get_num_hidden_layers(config: transformers.PretrainedConfig) -> int: """Returns the number of hidden layers in the model architecture specified by the HuggingFace transformers config. NOTE: Different model configurations have different number-of-layers attribute names. - num_hidden_layers: (GPTNeoXConfig, OPTConfig) - n_layer: (GPT2Config, GPTJConfig, BloomConfig) """ num_hidden_layers_attrs = ("num_hidden_layers", "n_layer") return findattr(config, num_hidden_layers_attrs) def get_global_statistics(xs: torch.Tensor) -> Tuple[float, float, int]: """ Computes element-wise mean and variance of the tensor across processes """ sum_and_count = torch.tensor([xs.sum(), xs.numel()], device=xs.device) dist.all_reduce(sum_and_count, dist.ReduceOp.SUM) global_sum, count = sum_and_count global_mean = global_sum / count sum_var = torch.sum((xs - global_mean) ** 2) dist.all_reduce(sum_var, dist.ReduceOp.SUM) global_var = sum_var / count return global_mean, global_var, count def whiten(xs: torch.Tensor, shift_mean=True, distributed=True) -> torch.Tensor: """Whitens values""" if distributed and dist.is_initialized(): mean, var, _ = get_global_statistics(xs) else: var, mean = torch.var_mean(xs) whitened = (xs - mean) * torch.rsqrt(var + 1e-8) if not shift_mean: whitened += mean return whitened def logprobs_of_labels(logits, labels): """Log probabilities of the labels These are calculated from the logits.""" logprobs = F.log_softmax(logits, dim=-1) logprobs_labels = torch.gather(logprobs, dim=-1, index=labels.unsqueeze(-1)) return logprobs_labels.squeeze(-1) def flatten_dict( d: Union[dict, MutableMapping], parent_key: str = "", sep: str = "/", ) -> dict: # From: https://stackoverflow.com/a/6027615 items = [] for k, v in d.items(): new_key = parent_key + sep + 
k if parent_key else k if isinstance(v, MutableMapping): items.extend(flatten_dict(v, new_key, sep=sep).items()) else: items.append((new_key, v)) return dict(items) def get_tensor_stats(xs: torch.Tensor, mask: torch.Tensor, n: int): mean = (xs * mask).sum() / n return dict( mean=mean, min=torch.where(mask.bool(), xs, np.inf).min(), max=torch.where(mask.bool(), xs, -np.inf).max(), std=torch.sqrt(((xs - mean) * mask).pow(2).sum() / n), ) class RunningMoments: def __init__(self): """ Calculates the running mean and standard deviation of a data stream. Modified version of https://github.com/DLR-RM/stable-baselines3/blob/a6f5049a99a4c21a6f0bcce458ca3306cef310e0/stable_baselines3/common/running_mean_std.py """ self.mean = 0 self.std = 1 self.var = 1 self.count = 1e-24 def update(self, xs: torch.Tensor) -> Tuple[float, float]: """Updates running moments from batch's moments computed across ranks""" if dist.is_initialized(): xs_mean, xs_var, xs_count = get_global_statistics(xs) else: xs_count = xs.numel() xs_var, xs_mean = torch.var_mean(xs, unbiased=False) delta = xs_mean - self.mean tot_count = self.count + xs_count new_sum = xs_var * xs_count # correct old_sum deviation accounting for the new mean old_sum = self.var * self.count + delta**2 * self.count * xs_count / tot_count tot_sum = old_sum + new_sum self.mean += delta * xs_count / tot_count self.var = tot_sum / tot_count self.std = (self.var * tot_count / (tot_count - 1)).sqrt() self.count = tot_count return xs_mean, (xs_var * xs_count / (xs_count - 1)).sqrt() # OpenDelta utilities MODIFIED_MODULES_DICT = { "gptj": { "attention": ["attn.q_proj", "attn.k_proj", "attn.v_proj"], "mlp": ["mlp.fc_in", "mlp.fc_out"], "all": [ "attn.q_proj", "attn.k_proj", "attn.v_proj", "attn.out_proj", "mlp.fc_in", "mlp.fc_out", ], }, "gpt_neox": { "attention": ["attention.query_key_value"], "mlp": ["mlp.dense_h_to_4h", "mlp.dense_4h_to_h"], "all": [ "attention.query_key_value", "attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h", 
], }, "opt": { "attention": [ "self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj", "self_attn.out_proj", ], "mlp": ["fc1", "fc2"], "all": [ "self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj", "self_attn.out_proj", "fc1", "fc2", ], }, "bloom": { "attention": ["self_attention.query_key_value", "self_attention.dense"], "mlp": ["mlp.dense_h_to_4h", "mlp.dense_4h_to_h"], "all": [ "self_attention.query_key_value", "self_attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h", ], }, "t5": { "attention": [ "layer.0.SelfAttention.q", "layer.0.SelfAttention.k", "layer.0.SelfAttention.v", "layer.0.SelfAttention.o", "layer.1.EncDecAttention.q", "layer.1.EncDecAttention.k", "layer.1.EncDecAttention.v", "layer.1.EncDecAttention.o", ], "mlp": [ "layer.2.DenseReluDense.wo", "layer.2.DenseReluDense.wi_0", "layer.2.DenseReluDense.wi_1", ], "all": [ "layer.0.SelfAttention.q", "layer.0.SelfAttention.k", "layer.0.SelfAttention.v", "layer.0.SelfAttention.o", "layer.1.EncDecAttention.q", "layer.1.EncDecAttention.k", "layer.1.EncDecAttention.v", "layer.1.EncDecAttention.o", "layer.2.DenseReluDense.wo", "layer.2.DenseReluDense.wi_0", "layer.2.DenseReluDense.wi_1", ], }, } def generate_layer_regex(config: transformers.PretrainedConfig, num_layers_unfrozen: int = -1) -> str: """Generates a regex range for the specified number of learnable layers.""" if num_layers_unfrozen == -1: return "(\d)+." num_hidden_layers = hf_get_num_hidden_layers(config) start_layer = num_hidden_layers - num_layers_unfrozen if start_layer < 0: raise Exception("Number of layers unfrozen cannot be greater than number of layers in the model") pattern = f"(?:{regex_for_range(start_layer, num_hidden_layers - 1)})." 
return f"{pattern}" def get_delta_modified_modules( config: transformers.PretrainedConfig, modified_modules: List[str], num_layers_unfrozen: int = -1, ) -> List[str]: """Returns a list of module names to be modified for a given delta method with the specified number of learnable layers.""" unfrozen_layers_pattern = generate_layer_regex(config, num_layers_unfrozen) # [r] for regex as per https://github.com/thunlp/OpenDelta/blob/main/opendelta/utils/name_based_addressing.py#L20 regex_prefix = "[r]" # TODO (jon-tow): `decoder.block.` is hardcoded to support T5 layer naming. decoder_prefix = "decoder.block." if config.is_encoder_decoder else "" module_list = [regex_prefix + decoder_prefix + unfrozen_layers_pattern + module for module in modified_modules] return module_list def get_delta_model_class(model_type: str): if not HAS_OPENDELTA: raise ValueError("OpenDelta package required to train with delta models. https://github.com/thunlp/OpenDelta.") delta_models = { "bitfit": BitFitModel, "adapter": AdapterModel, "prefix": PrefixModel, "lora": LoraModel, "softprompt": SoftPromptModel, } return delta_models[model_type] def parse_delta_kwargs( config: transformers.PretrainedConfig, delta_kwargs: Dict[str, Any], num_layers_unfrozen: int = -1, ) -> Tuple[str, Dict[str, Any]]: """Parses through delta kwargs to get delta type and proper modified modules.""" # This function is needed to parse through the `delta_kwargs` in order to: # 1) Get the `delta_type` method name to access the correct `delta_model_class` # 2a) Accept user specified `modified_modules` and if not provided use the `trlx` default mapping # 2b) Convert the list of `modified_modules` to a range of layers that fit within the range # of learnable layers as specified by `num_layers_unfrozen` # Pop `delta_type` to allow passing the kwargs to the model constructor since # `delta_type` is not a valid argument of the constructor delta_type = delta_kwargs.pop("delta_type") assert delta_type in ["lora"], "Only `LoRA` 
based delta models are supported" # Use `trlx` default modified modules if none are specified modified_modules = delta_kwargs.get("modified_modules", "all") if modified_modules in ["all", "attention", "mlp"]: if config.model_type not in MODIFIED_MODULES_DICT: raise ValueError( f"Model type `{config.model_type}` is not currently supported for " "delta training with default modified modules." ) modified_modules = MODIFIED_MODULES_DICT[config.model_type][modified_modules] # Update the `modified_modules` with the correct layer ranges delta_kwargs["modified_modules"] = get_delta_modified_modules( config, modified_modules, num_layers_unfrozen=num_layers_unfrozen ) return delta_type, delta_kwargs def regex_for_range(min_: int, max_: int) -> str: # noqa """Returns a regex that matches all numbers in the given range. Example: regex_for_range(12, 34) -> "1[2-9]|2\d|3[0-4]" Copyright (c) 2013, Dmitry Voronin. All rights reserved. Reference: https://github.com/voronind/range-regex """ def split_to_patterns(min_, max_): subpatterns = [] start = min_ for stop in split_to_ranges(min_, max_): subpatterns.append(range_to_pattern(start, stop)) start = stop + 1 return subpatterns def split_to_ranges(min_, max_): stops = {max_} nines_count = 1 stop = fill_by_nines(min_, nines_count) while min_ <= stop < max_: stops.add(stop) nines_count += 1 stop = fill_by_nines(min_, nines_count) zeros_count = 1 stop = fill_by_zeros(max_ + 1, zeros_count) - 1 while min_ < stop <= max_: stops.add(stop) zeros_count += 1 stop = fill_by_zeros(max_ + 1, zeros_count) - 1 stops = list(stops) stops.sort() return stops def fill_by_nines(integer, nines_count): return int(str(integer)[:-nines_count] + "9" * nines_count) def fill_by_zeros(integer, zeros_count): return integer - integer % 10**zeros_count def range_to_pattern(start, stop): pattern = "" any_digit_count = 0 for start_digit, stop_digit in zip(str(start), str(stop)): if start_digit == stop_digit: pattern += start_digit elif start_digit != "0" or 
stop_digit != "9": pattern += "[{}-{}]".format(start_digit, stop_digit) else: any_digit_count += 1 if any_digit_count: pattern += r"\d" if any_digit_count > 1: pattern += "{{{}}}".format(any_digit_count) return pattern positive_subpatterns = [] negative_subpatterns = [] if min_ < 0: min__ = 1 if max_ < 0: min__ = abs(max_) max__ = abs(min_) negative_subpatterns = split_to_patterns(min__, max__) min_ = 0 if max_ >= 0: positive_subpatterns = split_to_patterns(min_, max_) negative_only_subpatterns = ["-" + val for val in negative_subpatterns if val not in positive_subpatterns] positive_only_subpatterns = [val for val in positive_subpatterns if val not in negative_subpatterns] intersected_subpatterns = ["-?" + val for val in negative_subpatterns if val in positive_subpatterns] subpatterns = negative_only_subpatterns + intersected_subpatterns + positive_only_subpatterns return "|".join(subpatterns) def get_optimizer_grouped_parameters(model, weight_decay, no_decay_name_list=["bias", "LayerNorm.weight"]): optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if (not any(nd in n for nd in no_decay_name_list) and p.requires_grad) ], "weight_decay": weight_decay, }, { "params": [ p for n, p in model.named_parameters() if (any(nd in n for nd in no_decay_name_list) and p.requires_grad) ], "weight_decay": 0.0, }, ] return optimizer_grouped_parameters def get_all_reduce_mean(tensor): torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) tensor = tensor / torch.distributed.get_world_size() return tensor def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): zero_stage_3 = (zero_stage == 3) with torch.no_grad(): for param, param_ema in zip(model.parameters(), model_ema.parameters()): # TODO: use prefiltering for efficiency params_to_fetch = _z3_params_to_fetch([param, param_ema ]) if zero_stage_3 else [] should_gather_param = len(params_to_fetch) > 0 with deepspeed.zero.GatheredParameters( params_to_fetch, 
enabled=should_gather_param): data = param.data if device is not None: data = data.to(device) param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) def save_hf_format(model, tokenizer, args, sub_folder=""): # used to save huggingface format, so we can use it for hf.from_pretrained model_to_save = model.module if hasattr(model, 'module') else model CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" output_dir = os.path.join(args.output_dir, sub_folder) try: os.makedirs(output_dir) except FileExistsError: pass output_model_file = os.path.join(output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir, CONFIG_NAME) save_dict = model_to_save.state_dict() for key in list(save_dict.keys()): if "lora" in key: del save_dict[key] torch.save(save_dict, output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_dir) def _z3_params_to_fetch(param_list): return [ p for p in param_list if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE ] def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0): zero_stage_3 = (zero_stage == 3) os.makedirs(save_dir, exist_ok=True) WEIGHTS_NAME = "pytorch_model.bin" output_model_file = os.path.join(save_dir, WEIGHTS_NAME) model_to_save = model_ema.module if hasattr(model_ema, 'module') else model_ema if not zero_stage_3: if global_rank == 0: torch.save(model_to_save.state_dict(), output_model_file) else: output_state_dict = {} for k, v in model_to_save.named_parameters(): if hasattr(v, 'ds_id'): with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v]), enabled=zero_stage_3): v_p = v.data#.cpu() else: v_p = v#.cpu() if global_rank == 0 and "lora" not in k: output_state_dict[k] = v_p if global_rank == 0: torch.save(output_state_dict, output_model_file) del output_state_dict def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]: ordering_and_checkpoint_path = [] glob_checkpoints = 
[str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)] for path in glob_checkpoints: if use_mtime: ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) else: regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path) if regex_match is not None and regex_match.groups() is not None: ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) checkpoints_sorted = sorted(ordering_and_checkpoint_path) checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] return checkpoints_sorted def rotate_checkpoints(save_total_limit, use_mtime=False, output_dir=None, best_model_checkpoint=None) -> None: if save_total_limit is None or save_total_limit <= 0: return # Check if we should delete older checkpoint(s) checkpoints_sorted = sorted_checkpoints(use_mtime=use_mtime, output_dir=output_dir) if len(checkpoints_sorted) <= save_total_limit: return # If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last checkpoint, which # we don't do to allow resuming. 
save_total_limit_tmp = save_total_limit if ( best_model_checkpoint is not None and save_total_limit == 1 and checkpoints_sorted[-1] != best_model_checkpoint ): save_total_limit_tmp = 2 number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit_tmp) checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] for checkpoint in checkpoints_to_be_deleted: print(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") shutil.rmtree(checkpoint, ignore_errors=True) def qwen_make_context( tokenizer: PreTrainedTokenizer, query: str, history: List[Tuple[str, str]] = None, system: str = "", max_window_size: int = 6144, chat_format: str = "chatml", ): if history is None: history = [] if chat_format == "chatml": im_start, im_end = "<|im_start|>", "<|im_end|>" def _tokenize_str(content_1, content_2=None, add_special_tokens=True, truncation=False, max_length=None): # if content_2 is not None: # content = f"{im_start}{content_1}{im_end}\n{im_start}{content_2}{im_end}\n" # else: # content = f"{im_start}{content_1}{im_end}\n" token_ids = tokenizer.encode(content_1, content_2, add_special_tokens=add_special_tokens, max_length=max_length, truncation=truncation) content = tokenizer.decode(token_ids) return content, token_ids # system message system_text, system_tokens = _tokenize_str(f"system\n{system}") # current-turn user query max_query_length = max_window_size - len(system_tokens) + 2 assert max_query_length > 0, f"System message length ({len(system_tokens)}) has exceeded max window size ({max_window_size})" query_text, query_tokens = _tokenize_str(f"user\n{query}", f"assistant\n", truncation=True, max_length=max_query_length) # remove additional "<|im_end|>" and "\n" query_text = query_text.strip("\n").strip(im_end) query_tokens = query_tokens[:-2] # history raw_text = "" context_tokens = [] for turn_query, turn_response in reversed(history): prev_chat, next_context_tokens = _tokenize_str(f"user\n{turn_query}", 
f"assistant\n{turn_response}") current_context_size = ( len(system_tokens) + len(query_tokens) + len(next_context_tokens) + len(context_tokens) ) if current_context_size < max_window_size: context_tokens = next_context_tokens + context_tokens raw_text = prev_chat + raw_text else: break raw_text = system_text + raw_text + query_text context_tokens = system_tokens + context_tokens + query_tokens elif chat_format == "raw": raw_text = query context_tokens = tokenizer.encode(raw_text) else: raise NotImplementedError(f"Unknown chat format {chat_format!r}") return raw_text, context_tokens ================================================ FILE: src/utils/nlp_utils.py ================================================ import re CLEAN_TEXT_PATTERN = re.compile(r"[\r\n]") def clean_text(text): if not isinstance(text, str): return "" return CLEAN_TEXT_PATTERN.sub("", text)