Repository: FederatedAI/FATE-LLM Branch: main Commit: 0c63377e468f Files: 172 Total size: 765.2 KB Directory structure: gitextract_qkv2xwam/ ├── LICENSE ├── README.md ├── RELEASE.md ├── doc/ │ ├── fate_llm_evaluate.md │ ├── standalone_deploy.md │ └── tutorial/ │ ├── fdkt/ │ │ ├── README.md │ │ └── fdkt.ipynb │ ├── fedcot/ │ │ ├── README.md │ │ ├── encoder_decoder_tutorial.ipynb │ │ └── fedcot_tutorial.ipynb │ ├── fedkseed/ │ │ ├── README.md │ │ └── fedkseed-example.ipynb │ ├── fedmkt/ │ │ ├── README.md │ │ └── fedmkt.ipynb │ ├── inferdpt/ │ │ └── inferdpt_tutorial.ipynb │ ├── offsite_tuning/ │ │ ├── Offsite_tuning_tutorial.ipynb │ │ └── README.md │ └── pellm/ │ ├── ChatGLM3-6B_ds.ipynb │ └── builtin_pellm_models.md ├── examples/ │ ├── fedmkt/ │ │ ├── __init__.py │ │ ├── fedmkt.py │ │ ├── fedmkt_config.yaml │ │ └── test_fedmkt_llmsuit.yaml │ ├── offsite_tuning/ │ │ ├── __init__.py │ │ ├── offsite_tuning.py │ │ ├── offsite_tuning_config.yaml │ │ └── test_offsite_tuning_llmsuite.yaml │ └── pellm/ │ ├── __init__.py │ ├── bloom_lora_config.yaml │ ├── test_bloom_lora.py │ └── test_pellm_llmsuite.yaml └── python/ ├── MANIFEST.in ├── fate_llm/ │ ├── __init__.py │ ├── algo/ │ │ ├── __init__.py │ │ ├── dp/ │ │ │ ├── __init__.py │ │ │ ├── dp_trainer.py │ │ │ └── opacus_compatibility/ │ │ │ ├── __init__.py │ │ │ ├── grad_sample/ │ │ │ │ ├── __init__.py │ │ │ │ └── embedding.py │ │ │ ├── optimizers/ │ │ │ │ ├── __init__.py │ │ │ │ └── optimizer.py │ │ │ └── transformers_compate.py │ │ ├── fdkt/ │ │ │ ├── __init__.py │ │ │ ├── cluster/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cluster.py │ │ │ │ └── cluster_method.py │ │ │ ├── fdkt_data_aug.py │ │ │ ├── inference_inst.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── dp_loss.py │ │ │ ├── invalid_data_filter.py │ │ │ └── text_generate.py │ │ ├── fedavg/ │ │ │ ├── __init__.py │ │ │ └── fedavg.py │ │ ├── fedcollm/ │ │ │ ├── __init__.py │ │ │ ├── fedcollm.py │ │ │ ├── fedcollm_trainer.py │ │ │ └── fedcollm_training_args.py │ │ ├── fedcot/ │ │ │ ├── __init__.py │ │ │ ├── encoder_decoder/ │ │ │ │ ├── __init__.py │ │ │ │ ├── init/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── default_init.py │ │ │ │ └── slm_encoder_decoder.py │ │ │ ├── fedcot_trainer.py │ │ │ └── slm_encoder_decoder_trainer.py │ │ ├── fedkseed/ │ │ │ ├── __init__.py │ │ │ ├── args.py │ │ │ ├── fedkseed.py │ │ │ ├── optimizer.py │ │ │ ├── pytorch_utils.py │ │ │ ├── trainer.py │ │ │ └── zo_utils.py │ │ ├── fedmkt/ │ │ │ ├── __init__.py │ │ │ ├── fedmkt.py │ │ │ ├── fedmkt_data_collator.py │ │ │ ├── fedmkt_trainer.py │ │ │ ├── token_alignment/ │ │ │ │ ├── __init__.py │ │ │ │ ├── spectal_token_mapping.py │ │ │ │ ├── token_align.py │ │ │ │ └── vocab_mapping.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── dataset_sync_util.py │ │ │ ├── generate_logit_utils.py │ │ │ ├── tokenizer_tool.py │ │ │ └── vars_define.py │ │ ├── inferdpt/ │ │ │ ├── __init__.py │ │ │ ├── _encode_decode.py │ │ │ ├── inferdpt.py │ │ │ ├── init/ │ │ │ │ ├── _init.py │ │ │ │ └── default_init.py │ │ │ └── utils.py │ │ ├── offsite_tuning/ │ │ │ ├── __init__.py │ │ │ └── offsite_tuning.py │ │ └── ppc-gpt/ │ │ └── __init__.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_collator/ │ │ │ ├── __init__.py │ │ │ ├── cust_data_collator.py │ │ │ └── fedcot_collator.py │ │ └── tokenizers/ │ │ ├── __init__.py │ │ └── cust_tokenizer.py │ ├── dataset/ │ │ ├── __init__.py │ │ ├── data_config/ │ │ │ ├── __init__.py │ │ │ ├── default_ag_news.yaml │ │ │ └── default_yelp_review.yaml │ │ ├── fedcot_dataset.py │ │ ├── flex_dataset.py │ │ ├── hf_dataset.py │ 
│ ├── input_output_dataset.py │ │ ├── prompt_dataset.py │ │ ├── qa_dataset.py │ │ └── seq_cls_dataset.py │ ├── evaluate/ │ │ ├── __init__.py │ │ ├── scripts/ │ │ │ ├── __init__.py │ │ │ ├── _options.py │ │ │ ├── config_cli.py │ │ │ ├── data_cli.py │ │ │ ├── eval_cli.py │ │ │ └── fate_llm_cli.py │ │ ├── tasks/ │ │ │ ├── __init__.py │ │ │ ├── advertise_gen/ │ │ │ │ ├── __init__.py │ │ │ │ ├── advertise_utils.py │ │ │ │ └── default_advertise_gen.yaml │ │ │ └── dolly_15k/ │ │ │ ├── __init__.py │ │ │ ├── default_dolly_15k.yaml │ │ │ └── dolly_utils.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── _io.py │ │ ├── _parser.py │ │ ├── config.py │ │ ├── data_tools.py │ │ ├── llm_evaluator.py │ │ └── model_tools.py │ ├── inference/ │ │ ├── __init__.py │ │ ├── api.py │ │ ├── hf_qw.py │ │ ├── inference_base.py │ │ └── vllm.py │ ├── model_zoo/ │ │ ├── __init__.py │ │ ├── embedding_transformer/ │ │ │ ├── __init__.py │ │ │ └── st_model.py │ │ ├── hf_model.py │ │ ├── offsite_tuning/ │ │ │ ├── __init__.py │ │ │ ├── bloom.py │ │ │ ├── gpt2.py │ │ │ ├── llama.py │ │ │ └── offsite_tuning_model.py │ │ └── pellm/ │ │ ├── __init__.py │ │ ├── albert.py │ │ ├── bart.py │ │ ├── bert.py │ │ ├── bloom.py │ │ ├── chatglm.py │ │ ├── deberta.py │ │ ├── distilbert.py │ │ ├── gpt2.py │ │ ├── llama.py │ │ ├── opt.py │ │ ├── parameter_efficient_llm.py │ │ ├── qwen.py │ │ └── roberta.py │ ├── runner/ │ │ ├── __init__.py │ │ ├── fdkt_runner.py │ │ ├── fedcot_runner.py │ │ ├── fedkseed_runner.py │ │ ├── fedmkt_runner.py │ │ ├── homo_seq2seq_runner.py │ │ ├── inferdpt_runner.py │ │ └── offsite_tuning_runner.py │ └── trainer/ │ ├── __init__.py │ └── seq2seq_trainer.py ├── requirements.txt └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # FATE-LLM FATE-LLM is a framework to support federated learning for large language models(LLMs) and small language models(SLMs).
## Design Principle - Federated learning for large language models(LLMs) and small language models(SLMs). - Promote training efficiency of federated LLMs using Parameter-Efficient methods. - Protect the IP of LLMs using FedIPR. - Protect data privacy during training and inference through privacy preserving mechanisms.
### Standalone deployment
* To deploy FATE-LLM v2.2.0 or higher, two ways are provided; please refer to the [deploy tutorial](./doc/standalone_deploy.md) for more details:
    * deploy with FATE only from PyPI, then use Launcher to run tasks
    * deploy with FATE, FATE-Flow and FATE-Client from PyPI; users can then run tasks with Pipeline
* To deploy lower versions: please refer to [FATE-Standalone deployment](https://github.com/FederatedAI/FATE#standalone-deployment).
    * To deploy FATE-LLM v2.0.* - v2.1.*, deploy FATE-Standalone with version >= 2.1, then make a new directory `{fate_install}/fate_llm`, clone the code into it, install the Python requirements, and add `{fate_install}/fate_llm/python` to `PYTHONPATH`
    * To deploy FATE-LLM v1.x, deploy FATE-Standalone with 1.11.3 <= version < 2.0, then copy the directory `python/fate_llm` to `{fate_install}/fate/python/fate_llm`

### Cluster deployment
Use [FATE-LLM deployment packages](https://github.com/FederatedAI/FATE/wiki/Download#llm%E9%83%A8%E7%BD%B2%E5%8C%85) to deploy; refer to [FATE-Cluster deployment](https://github.com/FederatedAI/FATE#cluster-deployment) for more deployment details.

## Quick Start
- [Federated ChatGLM3-6B Training](doc/tutorial/pellm/ChatGLM3-6B_ds.ipynb)
- [Builtin Models In PELLM](doc/tutorial/pellm/builtin_pellm_models.md)
- [FedMKT: Federated Mutual Knowledge Transfer for Large and Small Language Models](./doc/tutorial/fedmkt/)
- [FedCoT: Federated Chain-of-Thought Distillation for Large Language Models](./doc/tutorial/fedcot)
- [PPC-GPT: Federated Task-Specific Compression of Large Language Models via Pruning and Chain-of-Thought Distillation](https://aclanthology.org/2025.emnlp-main.747.pdf)
- [FDKT: Federated Domain-Specific Knowledge Transfer on Large Language Models Using Synthetic Data](./doc/tutorial/fdkt)
- [Offsite Tuning: Transfer Learning without Full Model](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb)
- [FedKSeed: Federated Full-Parameter Tuning of Billion-Sized Language Models with Communication Cost under 18 Kilobytes](./doc/tutorial/fedkseed/)
- [InferDPT: Privacy-preserving Inference for Black-box Large Language Models](./doc/tutorial/inferdpt/inferdpt_tutorial.ipynb)

## FATE-LLM Evaluate
- [Python SDK & CLI Usage Guide](./doc/fate_llm_evaluate.md)

## Citation
If you publish work that uses FATE-LLM, please cite FATE-LLM as follows:
```
@article{fan2023fate,
  title={Fate-llm: A industrial grade federated learning framework for large language models},
  author={Fan, Tao and Kang, Yan and Ma, Guoqiang and Chen, Weijing and Wei, Wenbin and Fan, Lixin and Yang, Qiang},
  journal={Symposium on Advances and Open Problems in Large Language Models (LLM@IJCAI'23)},
  year={2023}
}
```

================================================
FILE: RELEASE.md
================================================
## Release 2.2.0
### Major Features and Improvements
* Integrate the FedCoT (Federated Chain-of-Thought) algorithm, a novel framework that enhances local small language models (SLMs) using differential-privacy-protected Chains of Thought (CoT) generated by remote LLMs:
    * Implement InferDPT for privacy-preserving CoT generation.
    * Support an encoder-decoder mechanism for privacy-preserving CoT generation.
    * Add prefix trainers for step-by-step distillation and text encoder-decoder training.
* Integrate the FDKT algorithm, a framework that enables domain-specific knowledge transfer from LLMs to SLMs while preserving SLM data privacy.
* Deployment optimization: support installation of FATE-LLM via PyPI.

## Release 2.1.0
### Major Features and Improvements
* New FedMKT federated tuning algorithm: Federated Mutual Knowledge Transfer for Large and Small Language Models
    * Support three distinct scenarios: heterogeneous, homogeneous, and one-to-one
    * Support one-way LLM-to-SLM knowledge transfer
* Introduce the InferDPT algorithm, which leverages differential privacy (DP) to facilitate privacy-preserving inference for large language models.
* Introduce FATE-LLM Evaluate: evaluate FATE-LLM models in a few lines with the Python SDK or simple CLI commands (`fate_llm evaluate`), built-in cases included

## Release 2.0.0
### Major Features and Improvements
* Adapt to the FATE v2.0 framework:
    * Migrate parameter-efficient fine-tuning training methods and models.
    * Migrate Standard Offsite-Tuning and Extended Offsite-Tuning (Federated Offsite-Tuning+)
    * New trainer, dataset, and data-processing function design
* New FedKSeed federated tuning algorithm: train large language models in a federated learning setting with extremely low communication cost

## Release 1.3.0
### Major Features and Improvements
* FTL-LLM (Federated Learning + Transfer Learning + LLM)
    * Standard Offsite-Tuning and Extended Offsite-Tuning (Federated Offsite-Tuning+) now supported
    * Framework available for Emulator and Adapter development
    * New Offsite-Tuning Trainer introduced
    * Includes built-in models such as the GPT-2 family, Llama-7B, and the Bloom family
* FedIPR
    * Introduced WatermarkDataset as the foundational dataset class for backdoor-based watermarks
    * Added SignConv and SignLayerNorm blocks for feature-based watermark models
    * New FedIPR Trainer available
    * Built-in models with feature-based watermarks include AlexNet, ResNet18, DistilBert, and GPT2
* More models support parameter-efficient fine-tuning: ChatGLM2-6B and Bloom-7B1

## Release 1.2.0
### Major Features and Improvements
* Support federated training of LLaMA-7B with parameter-efficient fine-tuning.

## Release 1.1.0
### Major Features and Improvements
* Support federated training of ChatGLM-6B with parameter-efficient fine-tuning adapters such as LoRA and P-Tuning V2.
* Integration of `peft`, which supports many parameter-efficient adapters.

================================================
FILE: doc/fate_llm_evaluate.md
================================================
## FATE-LLM Python SDK
The FATE-LLM Python SDK provides a simple API for evaluating large language models. Built on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/), our evaluation tool may be used on pre-trained models from Huggingface, locally built models, as well as FATE-LLM models. [Built-in datasets](#built-in-tasks) currently include Dolly-15k and Advertise Generation. Below shows how to evaluate a given LLM in a few lines. For quick single-model evaluation, the steps below should suffice; if comparative evaluation among multiple models is desired, the CLI is recommended.
```python
from lm_eval.models.huggingface import HFLM
from fate_llm.evaluate.utils import llm_evaluator

# download data for built-in tasks if running fate-llm evaluation for the first time
# alternatively, use CLI `fate_llm data download` to download data
llm_evaluator.download_task("dolly-15k")
# set paths of built-in tasks
llm_evaluator.init_tasks()
# load model
bloom_lm = HFLM(pretrained='bloom-560m')
# if loading a local model, specify the peft storage location
# bloom_lm = HFLM(pretrained='bloom-560m', peft_path_format="path/to/peft")
# run evaluation
llm_evaluator.evaluate(model=bloom_lm, tasks="dolly-15k", show_result=True)
```

When the network allows, or if tasks are already cached, tasks from lm-evaluation-harness may be provided for evaluation in a similar style.

```python
from lm_eval.models.huggingface import HFLM
from fate_llm.evaluate.utils import llm_evaluator

# load model
bloom_lm = HFLM(pretrained='bloom-560m')
# if loading a local model, specify the peft storage location
# bloom_lm = HFLM(pretrained='bloom-560m', peft_path_format="path/to/peft")
# run evaluation
llm_evaluator.evaluate(model=bloom_lm, tasks="ceval", show_result=True)
```

## FATE-LLM Command Line Interface
FATE-LLM provides built-in tasks for comparing evaluation results of different LLM models. Alternatively, users may provide arbitrary tasks for evaluation.

### install
```bash
cd {path_to_fate_llm}/python
pip install -e .
```

### command options
```bash
fate_llm --help
```

#### evaluate:
1. in:
   ```bash
   fate_llm evaluate -i <path1>
   ```
   will run the llm testsuite at *path1*
2. eval-config:
   ```bash
   fate_llm evaluate -i <path1> -c <path2>
   ```
   will run llm testsuites in *path1* with evaluation configuration set to *path2*
3. result-output:
   ```bash
   fate_llm evaluate -i <path1> -o <path2>
   ```
   will run llm testsuites in *path1* with evaluation result output stored in *path2*

### config
```bash
fate_llm config --help
```
1. new:
   ```bash
   fate_llm config new
   ```
   will create a new evaluation configuration file in the current directory
2. show:
   ```bash
   fate_llm config show
   ```
   will show the current evaluation configuration
3. edit:
   ```bash
   fate_llm config edit
   ```
   will edit the evaluation configuration

### data
```bash
fate_llm data --help
```
1. download:
   ```bash
   fate_llm data download -t <task1> -t <task2> ...
   ```
   will download corresponding data for the given tasks

### FATE-LLM Eval job configuration
Configuration of jobs should be specified in a yaml file.
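Such a yaml testsuite is then passed to the CLI commands described above. As a rough sketch (the suite filename `bloom_suite.yaml` and the output path are placeholders, not files shipped with the repository):

```bash
# download data for the built-in task referenced by the suite (first run only)
fate_llm data download -t dolly-15k
# run every job defined in the suite and store the results
fate_llm evaluate -i ./bloom_suite.yaml -o ./eval_result
```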
A FATE-LLM testsuite includes the following elements:

- job group: each group includes an arbitrary number of jobs with paths to the corresponding script and configuration
- job: name of the evaluation job to be run, must be unique within each group list
- pretrained: path to the pretrained model, should be either a model name from Huggingface or a path relative to the testsuite
- peft: path to the peft file, should be relative to the testsuite, optional
- tasks: list of tasks to be evaluated, optional for jobs skipping evaluation
- include_path: should be specified if tasks are user-defined
- eval_conf: path to the evaluation configuration file, should be relative to the testsuite; if not provided, the default conf will be used

```yaml
bloom_lora:
  pretrained: "bloom-560m"
  peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory"
  tasks:
    - "dolly-15k"
```

- llm suite

```yaml
bloom_suite:
  bloom_zero_shot:
    pretrained: "bloom-560m"
    tasks:
      - "dolly-15k"
```

## Built-in Tasks
Currently, we include the following tasks in FATE-LLM Evaluate:

| Task Name | Alias | Task Type | Metric | Source |
|:---------:|:-------------:|:----------:|:-------:|:-------------------------------------------------------------------------:|
| Dolly-15k | dolly-15k | generation | rouge-L | [link](https://huggingface.co/datasets/databricks/databricks-dolly-15k) |
| ADGEN | advertise-gen | generation | rouge-L | [link](https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/README_en.md#instructions) |

Use the corresponding alias to reference tasks in the system.

================================================
FILE: doc/standalone_deploy.md
================================================
# FATE-LLM Single-Node Deployment Guide

## 1. Introduction

**Server Configuration:**

- **Quantity:** 1
- **Configuration:** 8 cores / 16 GB memory / 500 GB hard disk / GPU machine
- **Operating System:** CentOS Linux release 7
- **User:** app, owner: apps

The single-node version provides two deployment methods, which can be selected based on your needs:

- Install FATE-LLM from PyPI With FATE
- Install FATE-LLM from PyPI with FATE, FATE-Flow, FATE-Client

## 2. Install FATE-LLM from PyPI With FATE

In this way, users can run tasks with Launcher, a convenient way for fast experimentation.

### 2.1 Installing Python Environment
- Prepare and install a [conda](https://docs.conda.io/projects/miniconda/en/latest/) environment.
- Create a virtual environment:

```shell
# FATE-LLM requires Python >= 3.10
conda create -n fate_env python=3.10
conda activate fate_env
```

### 2.2 Installing FATE-LLM
This section introduces how to install FATE-LLM from PyPI with FATE. Execute the following command to install FATE-LLM.

```shell
pip install fate_llm[fate]==2.2.0
```

### 2.3 Usage
After installing successfully, please refer to the [tutorials](../README.md#quick-start) to run tasks; tasks described in the tutorials that run with Launcher are all supported.

## 3. Install FATE-LLM from PyPI with FATE, FATE-Flow, FATE-Client

In this way, users can run tasks with Pipeline or Launcher.

### 3.1 Installing Python Environment
Please refer to section 2.1.

### 3.2 Installing FATE-LLM with FATE, FATE-Flow, FATE-Client

```shell
pip install fate_llm[fate,fate_flow,fate_client]==2.2.0
```

### 3.3 Service Initialization

```shell
mkdir fate_workspace
fate_flow init --ip 127.0.0.1 --port 9380 --home $(pwd)/fate_workspace
pipeline init --ip 127.0.0.1 --port 9380
```

- `ip`: The IP address where the service runs.
- `port`: The HTTP port the service runs on.
- `home`: The data storage directory, including data, models, logs, job configurations, and SQLite databases.

### 3.4 Start FATE-Flow Service

```shell
fate_flow start
fate_flow status # make sure fate_flow service is started
```

FATE-Flow also provides other instructions such as stop and restart; use them only if you want to stop/restart fate_flow services.

```shell
# Warning: the normal installation process does not need to execute stop/restart instructions.
fate_flow stop
fate_flow restart
```

### 3.5 Usage
Please refer to the [tutorials](../README.md#quick-start) for more usage guides; tasks described in the tutorials that run with Pipeline or Launcher are all supported.

================================================
FILE: doc/tutorial/fdkt/README.md
================================================
# FATE-LLM: FDKT
The algorithm is based on the paper [Federated Domain-Specific Knowledge Transfer on Large Language Models Using Synthetic Data](https://arxiv.org/pdf/2405.14212), a novel framework that enables domain-specific knowledge transfer from LLMs to SLMs while preserving SLM data privacy.

## Citation
If you publish work that uses FDKT, please cite FDKT as follows:
```
@article{li2024federated,
  title={Federated Domain-Specific Knowledge Transfer on Large Language Models Using Synthetic Data},
  author={Li, Haoran and Zhao, Xinyuan and Guo, Dadi and Gu, Hanlin and Zeng, Ziqian and Han, Yuxing and Song, Yangqiu and Fan, Lixin and Yang, Qiang},
  journal={arXiv preprint arXiv:2405.14212},
  year={2024}
}
```

================================================
FILE: doc/tutorial/fdkt/fdkt.ipynb
================================================
{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Synthesize Data With FDKT" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, we will demonstrate how to synthesize data using the FATE-LLM framework. In FATE-LLM, we introduce the \"FDKT\" module, specifically designed for domain-specific knowledge transfer on large language models using synthetic data. The FDKT algorithm is based on the paper [Federated Domain-Specific Knowledge Transfer on\n", "Large Language Models Using Synthetic Data](https://arxiv.org/pdf/2405.14212). We integrate its code into the FATE-LLM framework. \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset: Yelp\n", "We process and sample data of the 'Health' subdomain from the [Yelp dataset](https://arxiv.org/abs/1509.01626); the dataset can be downloaded from [here](https://www.yelp.com/dataset). \n", "Once the dataset has been downloaded, execute the following command to untar the downloaded dataset."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "```shell\n", "tar -xvf yelp_dataset.tar\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following code will sample 5000 datalines of 'Health' subdomain, and train data will generated under the folder './processed_data/Health/train.json'" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "import sys\n", "import random\n", "from pathlib import Path\n", "random.seed(42)\n", "\n", "\n", "base_dir = \"./\"\n", "business_data_path = os.path.join(base_dir, 'yelp_academic_dataset_business.json')\n", "review_data_path = os.path.join(base_dir, 'yelp_academic_dataset_review.json')\n", "\n", "business_data_file = open(business_data_path, 'r')\n", "review_data_file = open(review_data_path, 'r')\n", "\n", "categories_list = ['Restaurants', 'Shopping', 'Arts', 'Health']\n", "business_dic = {}\n", "data_dict = {}\n", "for category in categories_list:\n", " business_dic[category] = set()\n", " data_dict[category] = []\n", "\n", "\n", "def get_categories(categories):\n", " return_list = []\n", " for category in categories_list:\n", " if category in categories:\n", " return_list.append(category)\n", " return return_list\n", "\n", "\n", "for line in business_data_file.readlines():\n", " dic = json.loads(line)\n", " if 'categories' in dic.keys() and dic['categories'] is not None:\n", " category = get_categories(dic['categories'])\n", " if len(category) == 1:\n", " business_dic[category[0]].add(dic['business_id'])\n", "\n", "# for category in categories_list:\n", "for line in review_data_file.readlines():\n", " dic = json.loads(line)\n", " if 'business_id' in dic.keys() and dic['business_id'] is not None:\n", " for category in categories_list:\n", " if dic['business_id'] in business_dic[category]:\n", " if dic['text'] is not None and dic['stars'] is not None:\n", " data_dict[category].append({'text': dic['text'], 'stars': dic['stars']})\n", " break\n", "\n", "train_data_path = os.path.join('processed_data', \"Health\", 'train.json')\n", "os.makedirs(Path(train_data_path).parent, exist_ok=True)\n", "train_data_file = open(train_data_path, 'w')\n", "data_list = data_dict[\"Health\"]\n", "\n", "sample_data_dict = dict()\n", "\n", "for data in data_list:\n", " star = int(data[\"stars\"])\n", " if star not in sample_data_dict:\n", " sample_data_dict[star] = []\n", "\n", " sample_data_dict[star].append(data)\n", "\n", "data_list = []\n", "star_keys = list(sample_data_dict.keys())\n", "for star in star_keys:\n", " sample_data = sample_data_dict[star][:1000]\n", " random.shuffle(sample_data)\n", " data_list.extend(sample_data)\n", "\n", "random.shuffle(data_list)\n", "json.dump(data_list, train_data_file, indent=4)\n", "train_data_file.close()\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Models Use\n", "Please download the following models, these models are used for data augmentation process.\n", "\n", "LLM: [Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat) \n", "SLM: [gpt2-xl](https://huggingface.co/openai-community/gpt2-xl)\n", "\n", "MeanWhile, 'all-mpnet-base-v2' is used to generate embedding vectors in LLM side.\n", "\n", "Embedding Model: [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Running FDKT Data Synthetic Process With Launcher (Experimential Using)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "### SLM Setting\n", "\n", "In this section, we will introduce some key configurations in SLM side." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 1. loading model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import transformers\n", "from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", "\n", "\n", "slm_pretrained_path = \"gpt2-xl\" # modity this to local directory\n", "slm = transformers.AutoModelForCausalLM.from_pretrained(slm_pretrained_path, torch_dtype=torch.bfloat16)\n", "tokenizer = get_tokenizer(slm_pretrained_path)\n", "tokenizer.pad_token_id = tokenizer.eos_token_id\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2. Initialize SLM Training Arugments" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.fdkt.fdkt_data_aug import FDKTTrainingArguments\n", "\n", "\n", "training_args = FDKTTrainingArguments(\n", " use_cpu=False, # use gpu to do dp(differential privacy) training process\n", " device_id=0, # the device number of gpu\n", " num_train_epochs=1, # dp training epochs\n", " per_device_train_batch_size=2, # batch size of dp training\n", " slm_generation_batch_size=32, # batch_size to generate data in slm side\n", " seq_num_for_single_category=300, # data num for each category(label)\n", " slm_generation_config=dict(\n", " max_new_tokens=256,\n", " temperature=1.0,\n", " top_k=50,\n", " top_p=0.9,\n", " repetition_penalty=1.0,\n", " pad_token_id=tokenizer.eos_token_id\n", " ),\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 3. Initlaize DataSet Instance\n", "\n", "We provide default templates for dataset \"Yelp\" and \"AGNews\", user can refer [here](https://github.com/FederatedAI/FATE-LLM/tree/dev-2.2.0/python/fate_llm/dataset/data_config) for more details. If you want to use your own dataset, please provide fields label_key/text_key/augment_format/filter_format/tokenize_format/sub_domain/label_list/few_shot_format/text_with_label_format like the two default templates and passing it as and argument." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fate_llm.dataset.flex_dataset import FlexDataset\n", "\n", "\n", "ds = FlexDataset(\n", " tokenizer_name_or_path=slm_pretrained_path,\n", " load_from=\"json\",\n", " data_part=\"train\",\n", " dataset_name=\"yelp_review\", # use default template\n", " # config=dict/template_path # if dataset_name not equals to \"yelp_review\" or \"ag_news\"\n", " need_preprocess=True,\n", " select_num=2000, # use data_num=2000 to train, default is None, None means using all data\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LLM Setting\n", "\n", "In this section, we will introduce some key configurations in LLM side." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 1. 
Deploy VLLM Server And Use OpenAI API Protocol To SpeedUp LLM Inference" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "please copy the following code to local file create_and_start_vllm.sh, then run the bash code by executing \"bash create_and_start_vllm.sh\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create_and_start_vllm.sh\n", "# create vllm enviroment\n", "\n", "python -m venv vllm_venv\n", "source vllm_venv/bin/activate\n", "pip install vllm==0.4.3\n", "pip install numpy==1.26.4 # numpy >= 2.0.0 will raise error, so reinstall numpy<2.0.0\n", "\n", "# please modify Qwen1.5-7B-Chat to local llm model saving path\n", "export CUDA_VISIBLE_DEVICES=1,2\n", "nohup python -m vllm.entrypoints.openai.api_server --host 127.0.0.1 --port 9999 --model Qwen1.5-7B-Chat --dtype=half --enforce-eager --api-key demo --device cuda -tp 2 &" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2. Initialize LLM Training Arugments" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.fdkt.fdkt_data_aug import FDKTTrainingArguments\n", "\n", "\n", "training_args = FDKTTrainingArguments(\n", " sample_num_per_cluster=4, # use this to estimate the number of clusters, n_clusters=(len(dataset) + sample_num_per_cluster - 1) // sample_num_per_cluster\n", " filter_prompt_max_length=2**16,\n", " filter_generation_config=dict(\n", " max_tokens=512,\n", " ),\n", " aug_generation_config=dict(\n", " max_tokens=4096,\n", " temperature=0.8,\n", " top_p=0.9,\n", " ),\n", " aug_prompt_num=20000, # prompts use for data augmentation\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 3. Initialize Embedding Generated Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fate_llm.model_zoo.embedding_transformer.st_model import SentenceTransformerModel\n", "\n", "\n", "embedding_lm = SentenceTransformerModel(model_name_or_path=\"all-mpnet-base-v2\").load() # modified model_name_or_path to local model saved path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4. Initalize OpenAI Api For Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.fdkt.inference_inst import api_init\n", "\n", "\n", "inference_inst = api_init(\n", " api_url=\"http://127.0.0.1:9999/v1/\",\n", " model_name=\"Qwen1.5-7B-Chat\", # modified model_name to local Meta-Llama-3-8B-Instruct saved path\n", " api_key=\"demo\"\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Complete Code \n", "\n", "Please paste the code in \"run_fdkt_by_launcher.py\" and execute it with the following command. 
Once the process is finished, augmentation data will be saved in the current directory, whose filename is aug_data_result.json" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "python run_fdkt_by_launcher.py --parties guest:9999 arbiter:10000 --log_level INFO" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "\n", "import torch\n", "from fate.arch import Context\n", "from fate.arch.launchers.multiprocess_launcher import launch\n", "\n", "# please replace the following four variables to local paths\n", "llm_pretrained_path = \"Qwen1.5-7B-Chat\"\n", "embedding_model_path = \"all-mpnet-base-v2\"\n", "slm_pretrained_path = \"gpt2-xl\"\n", "slm_data_path = \"./processed_data/Health/train.json\"\n", "\n", "\n", "def get_optimizer(model, optimizer=\"adam\", lr=1e-4):\n", " if optimizer == \"adam\":\n", " optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)\n", " elif optimizer == \"adamw\":\n", " optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)\n", " else:\n", " raise NotImplementedError(\"Given optimizer type is not supported\")\n", " return optimizer\n", "\n", "\n", "def train_slm(ctx):\n", " import transformers\n", " from fate_llm.algo.fdkt.fdkt_data_aug import (\n", " FDKTSLM,\n", " FDKTTrainingArguments\n", " )\n", " from fate_llm.dataset.flex_dataset import FlexDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers.data import DataCollatorForSeq2Seq\n", "\n", " slm = transformers.AutoModelForCausalLM.from_pretrained(slm_pretrained_path, torch_dtype=torch.bfloat16)\n", " tokenizer = get_tokenizer(slm_pretrained_path)\n", " tokenizer.pad_token_id = tokenizer.eos_token_id\n", " training_args = FDKTTrainingArguments(\n", " use_cpu=False,\n", " device_id=0,\n", " num_train_epochs=1,\n", " per_device_train_batch_size=2,\n", " slm_generation_batch_size=32,\n", " seq_num_for_single_category=2000,\n", " slm_generation_config=dict(\n", " max_new_tokens=256,\n", " do_sample=True,\n", " temperature=1.0,\n", " top_k=50,\n", " top_p=0.9,\n", " repetition_penalty=1.0,\n", " pad_token_id=tokenizer.eos_token_id\n", " ),\n", " # inference_method=\"vllm\",\n", " )\n", "\n", " ds = FlexDataset(\n", " tokenizer_name_or_path=slm_pretrained_path,\n", " load_from=\"json\",\n", " data_part=\"train\",\n", " dataset_name=\"yelp_review\",\n", " need_preprocess=True,\n", " select_num=2000, # use 2000 data to train, default is None, using all data\n", " )\n", " ds.load(slm_data_path)\n", "\n", " fdkt_runner = FDKTSLM(\n", " ctx=ctx,\n", " model=slm,\n", " training_args=training_args,\n", " tokenizer=tokenizer,\n", " train_set=ds,\n", " optimizer=get_optimizer(slm),\n", " data_collator=DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=tokenizer.pad_token_id)\n", " )\n", "\n", " aug_data = fdkt_runner.aug_data()\n", " with open(\"./aug_data_result.json\", \"w\") as fout:\n", " fout.write(json.dumps(aug_data, indent=4))\n", "\n", "\n", "def train_llm(ctx):\n", " from fate_llm.algo.fdkt.fdkt_data_aug import (\n", " FDKTLLM,\n", " FDKTTrainingArguments\n", " )\n", " from fate_llm.model_zoo.embedding_transformer.st_model import SentenceTransformerModel\n", " from fate_llm.dataset.flex_dataset import FlexDataset\n", " from fate_llm.algo.fdkt.inference_inst import api_init, vllm_init\n", "\n", " embedding_lm = SentenceTransformerModel(model_name_or_path=embedding_model_path).load()\n", " training_args = 
FDKTTrainingArguments(\n", " sample_num_per_cluster=4,\n", " filter_prompt_max_length=2**14,\n", " filter_generation_config=dict(\n", " max_tokens=4096,\n", " ),\n", " use_cpu=False,\n", " aug_generation_config=dict(\n", " max_tokens=4096,\n", " temperature=0.8,\n", " top_p=0.9,\n", " ),\n", " aug_prompt_num=20000,\n", " )\n", "\n", " ds = FlexDataset(\n", " tokenizer_name_or_path=llm_pretrained_path,\n", " load_from=\"json\",\n", " data_part=\"train\",\n", " dataset_name=\"yelp_review\",\n", " need_preprocess=True,\n", " few_shot_num_per_label=1,\n", " )\n", "\n", " inference_inst = api_init(\n", " api_url=\"http://127.0.0.1:9999/v1/\",\n", " model_name=llm_pretrained_path,\n", " api_key=\"demo\"\n", " )\n", "\n", " fdkt_runner = FDKTLLM(\n", " ctx=ctx,\n", " embedding_model=embedding_lm,\n", " training_args=training_args,\n", " dataset=ds,\n", " inference_inst=inference_inst,\n", " )\n", "\n", " fdkt_runner.aug_data()\n", "\n", "\n", "def run(ctx: Context):\n", " if ctx.is_on_arbiter:\n", " train_llm(ctx)\n", " else:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", " train_slm(ctx)\n", "\n", "\n", "if __name__ == \"__main__\":\n", " launch(run)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Running FDKT with Pipeline (Industrial Using)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Please make sure that FATE and FATE-Flow has been deployed, paste the following code to test_fdkt_by_pipeline.py, the execute \"python test_fdkt_by_pipeline.py\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_fdkt_runner\n", "from fate_client.pipeline.components.fate.nn.algo_params import FDKTTrainingArguments\n", "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", "from fate_client.pipeline import FateFlowPipeline\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline.components.fate.nn.torch import nn, optim\n", "\n", "\n", "guest = '9999'# replace this party id to actual guest party id in your enviroment\n", "arbiter = '9999'# replace this party id to actual arbiter party id in your enviroment\n", "\n", "# please replace the following four variables to local paths\n", "llm_pretrained_path = \"Qwen1.5-7B-Chat\"\n", "embedding_model_path = \"all-mpnet-base-v2/\"\n", "slm_pretrained_path = \"gpt2-xl\"\n", "slm_data_path = \"./processed_data/Health/train.json\" # should be absolute path\n", "\n", "\n", "def get_llm_conf():\n", " embedding_model = LLMModelLoader(\n", " \"embedding_transformer.st_model\",\n", " \"SentenceTransformerModel\",\n", " model_name_or_path=embedding_model_path\n", " )\n", "\n", " dataset = LLMDatasetLoader(\n", " \"flex_dataset\",\n", " \"FlexDataset\",\n", " tokenizer_name_or_path=llm_pretrained_path,\n", " need_preprocess=True,\n", " dataset_name=\"yelp_review\",\n", " data_part=\"train\",\n", " load_from=\"json\",\n", " few_shot_num_per_label=1,\n", " )\n", "\n", " training_args = FDKTTrainingArguments(\n", " sample_num_per_cluster=4,\n", " filter_prompt_max_length=2 ** 14,\n", " filter_generation_config=dict(\n", " max_tokens=4096,\n", " ),\n", " use_cpu=False,\n", " aug_generation_config=dict(\n", " max_tokens=4096,\n", " temperature=0.8,\n", " top_p=0.9,\n", " ),\n", " aug_prompt_num=20000,\n", " )\n", "\n", " inference_inst_conf = dict(\n", " module_name=\"fate_llm.algo.fdkt.inference_inst\",\n", " 
item_name=\"api_init\",\n", " kwargs=dict(\n", " api_url=\"http://127.0.0.1:9999/v1/\",\n", " model_name=llm_pretrained_path,\n", " api_key=\"demo\"\n", " )\n", " )\n", "\n", " return get_config_of_fdkt_runner(\n", " training_args=training_args,\n", " embedding_model=embedding_model,\n", " dataset=dataset,\n", " inference_inst_conf=inference_inst_conf,\n", " )\n", "\n", "\n", "def get_slm_conf():\n", " slm_model = LLMModelLoader(\n", " \"hf_model\",\n", " \"HFAutoModelForCausalLM\",\n", " pretrained_model_name_or_path=slm_pretrained_path,\n", " torch_dtype=\"bfloat16\",\n", " )\n", "\n", " tokenizer = LLMDataFuncLoader(\n", " \"tokenizers.cust_tokenizer\",\n", " \"get_tokenizer\",\n", " tokenizer_name_or_path=slm_pretrained_path,\n", " pad_token_id=50256\n", " )\n", "\n", " training_args = FDKTTrainingArguments(\n", " use_cpu=False,\n", " device_id=1,\n", " num_train_epochs=1,\n", " per_device_train_batch_size=2,\n", " slm_generation_batch_size=32,\n", " seq_num_for_single_category=2000,\n", " slm_generation_config=dict(\n", " max_new_tokens=256,\n", " do_sample=True,\n", " temperature=1.0,\n", " top_k=50,\n", " top_p=0.9,\n", " repetition_penalty=1.0,\n", " pad_token_id=50256\n", " ),\n", " )\n", "\n", " dataset = LLMDatasetLoader(\n", " \"flex_dataset\",\n", " \"FlexDataset\",\n", " tokenizer_name_or_path=slm_pretrained_path,\n", " need_preprocess=True,\n", " dataset_name=\"yelp_review\",\n", " data_part=\"train\",\n", " load_from=\"json\",\n", " select_num=2000,\n", " few_shot_num_per_label=1,\n", " )\n", "\n", " optimizer = optim.Adam(lr=0.01)\n", "\n", " return get_config_of_fdkt_runner(\n", " model=slm_model,\n", " tokenizer=tokenizer,\n", " training_args=training_args,\n", " dataset=dataset,\n", " optimizer=optimizer,\n", " data_collator=LLMDataFuncLoader(\n", " \"data_collator.cust_data_collator\",\n", " \"get_seq2seq_data_collator\",\n", " label_pad_token_id=50256,\n", " tokenizer_name_or_path=slm_pretrained_path,\n", " pad_token_id=50256,\n", " ),\n", " )\n", "\n", "\n", "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", "pipeline.bind_local_path(path=slm_data_path, namespace=\"experiment\", name=\"slm_train\")\n", "\n", "\n", "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest))\n", "reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"slm_train\"\n", ")\n", "\n", "\n", "homo_nn_0 = HomoNN(\n", " 'homo_nn_0',\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"fdkt_runner\",\n", " runner_class=\"FDKTRunner\",\n", ")\n", "\n", "homo_nn_0.arbiter.task_parameters(\n", " runner_conf=get_llm_conf()\n", ")\n", "\n", "homo_nn_0.guest.task_parameters(\n", " runner_conf=get_slm_conf()\n", ")\n", "\n", "pipeline.add_tasks([reader_0, homo_nn_0])\n", "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 1}))\n", "\n", "pipeline.compile()\n", "pipeline.fit()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: doc/tutorial/fedcot/README.md ================================================ # FATE-LLM: FedCoT The algorithm is based on paper ["FedCoT: Federated Chain-of-Thought Distillation for Large Language 
Models"](https://aclanthology.org/anthology-files/anthology-files/pdf/findings/2025.findings-emnlp.454.pdf). We integrate its code into the FATE-LLM framework.

## Citation
If you publish work that uses FedCoT, please cite FedCoT as follows:
```
@inproceedings{fan2025fedcot,
  title={FedCoT: Federated Chain-of-Thought Distillation for Large Language Models},
  author={Fan, Tao and Chen, Weijing and Kang, Yan and Ma, Guoqiang and Gu, Hanlin and Song, Yuanfeng and Fan, Lixin and Yang, Qiang},
  booktitle={Findings of the Association for Computational Linguistics: EMNLP 2025},
  pages={8546--8557},
  year={2025}
}
```

================================================
FILE: doc/tutorial/fedcot/encoder_decoder_tutorial.ipynb
================================================
{ "cells": [ { "cell_type": "markdown", "id": "a163d9c2-f9d6-4c61-a8e8-76a3f66c38ae", "metadata": {}, "source": [ "# FedCoT - Train an SLM Encoder-Decoder" ] }, { "cell_type": "markdown", "id": "f2b56772-26d5-44fe-9c51-7bc662478b98", "metadata": {}, "source": [ "FedCoT is an innovative framework designed to distill knowledge from large language models (LLMs) to small language models (SLMs) while ensuring data privacy. This method involves a strategy that trains a small language model (SLM) to learn from perturbed and recovered texts. The SLM can then encode raw text, produce results similar to differential privacy mechanisms, and return higher-quality recovered text.\n", "\n", "In this tutorial, we will introduce how to train an SLM using the built-in trainer." ] }, { "cell_type": "markdown", "id": "62c6d18a-cc91-4cf5-9cfd-0f97095f7041", "metadata": {}, "source": [ "## Prepare Data\n", "\n", "Several steps need to be done to prepare data for training an SLM encoder-decoder model:\n", "- Sample data from the original dataset (for example, 50%)\n", "- Organize raw text and get a direct rationale reply from a remote LLM\n", "- Perturb docs using InferDPTKit to get perturbed docs\n", "- Get perturbed replies from a remote LLM\n", "- Organize training data\n", "\n", "### Sample data\n", "Here we will use the arc-easy data as an example, and take the first 50% of the original dataset.\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "40cc1bb8-a17c-4abc-9279-0849e98ca116", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, load_from_disk\n", "ds = load_dataset('arc_easy')['train']\n", "ds = [ds[i] for i in range(len(ds)//2)]" ] }, { "cell_type": "markdown", "id": "0caff897-5b2b-4409-8601-10f973133b10", "metadata": {}, "source": [ "### Get Direct Replies from A Remote LLM\n", "\n", "We use the inference class to create an API for remote LLMs, or you can implement this part on your own."
] }, { "cell_type": "code", "execution_count": 42, "id": "cf128b46-dea2-4eb4-bf31-568e56b9b78e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] } ], "source": [ "from fate_llm.inference.api import APICompletionInference\n", "from jinja2 import Template\n", "from transformers import AutoTokenizer\n", "\n", "# We are using a Qwen 14B model as the remote model\n", "# You can change the setting\n", "api = APICompletionInference(\n", " api_url='http://172.21.140.2:8081/v1',\n", " api_key='EMPTY',\n", " model_name='/data/cephfs/llm/models/Qwen1.5-14B-Chat'\n", ")\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('/data/cephfs/llm/models/Qwen1.5-0.5B-Chat/')\n", "\n", "arc_e_template_r = \"\"\"Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\n", "\n", "Example(s):\n", "Question:Which factor will most likely cause a person to develop a fever?\n", "Choices:['a leg muscle relaxing after exercise', 'a bacterial population in the bloodstream', 'several viral particles on the skin', 'carbohydrates being digested in the stomach']\n", "Rationale:A bacterial infection in the bloodstream triggers the immune system to respond, therefore often causing a fever as the body tries to fight off the bacteria. Therefore, the answer is 'a bacterial population in the bloodstream'\n", "\n", "Please explain:\n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "Rationale:\n", "\"\"\"\n", "\n", "template = Template(arc_e_template_r)\n", "docs_to_infer = [tokenizer.apply_chat_template([{'role':'system', 'content': 'you are a helpful assistant'}, {'role':'user', 'content': template.render(i)}], add_generation_prompt=True, tokenize=False) for i in ds]\n", "results = api.inference(docs_to_infer, {\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", "})\n", "\n", "for i, r in zip(ds, results):\n", " i['rationale'] = r" ] }, { "cell_type": "code", "execution_count": 6, "id": "212822ab-9f64-49a2-bb95-ef8ee2de8e49", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A fever is a response to an infection, typically caused by bacteria or viruses. So, the answer is 'a bacterial population in the bloodstream' because it indicates an immune response to a foreign invader. 'Several viral particles on the skin' could also lead to a fever if they enter the body, but bloodstream presence is more direct. 
The other choices are unrelated to fever development.\n" ] } ], "source": [ "print(results[0])" ] }, { "cell_type": "markdown", "id": "0f6a0039-1530-4b87-a098-fd2eb01805c2", "metadata": {}, "source": [ "### Perturb Docs & Replies\n", "\n", "You can refer to the InferDPT tutorial for guidance on using the InferDPTKit to generate perturbed documents: [InferDPT Document](./)\n", "We can produce perturbed doc using InferDPTKit:" ] }, { "cell_type": "code", "execution_count": 8, "id": "39249747-bfaa-43bf-8b66-896568941ab8", "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.inferdpt.utils import InferDPTKit\n", "path_to_kit = '/data/projects/inferdpt/test_fate_llm/'\n", "kit = InferDPTKit.load_from_path(path_to_kit)" ] }, { "cell_type": "code", "execution_count": 22, "id": "39b9cefa-dfdb-4bac-b313-4ca3bc118aee", "metadata": {}, "outputs": [], "source": [ "import copy\n", "tmp_ds = copy.deepcopy(ds)\n", "\n", "q_doc = [kit.perturb(i, epsilon=1.0) for i in [Template(\"\"\"{{question}}\"\"\").render(i) for i in tmp_ds]]\n", "c_doc = [kit.perturb(i, epsilon=1.0) for i in [Template(\"\"\"{{choices.text}}\"\"\").render(i) for i in tmp_ds]]\n", "for i,q,c in zip(tmp_ds,q_doc,c_doc):\n", " i['question'] = q\n", " i['choices']['text'] = c" ] }, { "cell_type": "code", "execution_count": 23, "id": "61b30886-746c-43c5-889a-a6583dc939d0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 'Mercury_7179953',\n", " 'question': 'stuff two alpha Rogers are today chap in Department?',\n", " 'choices': {'text': \"['muscular and skeletal', 'digestive and muscular', 'skeletal and pasteiratory', 'respiratory and exhibive']\",\n", " 'label': ['A', 'B', 'C', 'D']},\n", " 'answerKey': 'A',\n", " 'rationale': {...}}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tmp_ds[6]" ] }, { "cell_type": "markdown", "id": "fed90297-9957-4f8b-a53c-37a03d516c78", "metadata": {}, "source": [ "And then send formatted docs to remote LLM for perturbed responses:" ] }, { "cell_type": "code", "execution_count": 33, "id": "5b8bd833-fb0f-418b-bd9b-6452e8ae4d6c", "metadata": {}, "outputs": [], "source": [ "template = Template(arc_e_template_r)\n", "docs_to_infer = [tokenizer.apply_chat_template([{'role':'system', 'content': 'you are a helpful assistant'}, {'role':'user', 'content': template.render(i)}], add_generation_prompt=True, tokenize=False) for i in tmp_ds]\n", "p_results = api.inference(docs_to_infer, {\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", "})" ] }, { "cell_type": "code", "execution_count": 37, "id": "187361fa-8b73-4a01-9039-f52ec98a5791", "metadata": {}, "outputs": [], "source": [ "for i, r in zip(ds, p_results):\n", " i['p_rationale'] = r\n", "\n", "for i,q,c in zip(ds, q_doc, c_doc):\n", " i['p_question'] = q\n", " i['p_choice'] = c" ] }, { "cell_type": "markdown", "id": "927b2265-4e87-4275-98dc-7f33d405e19a", "metadata": {}, "source": [ "### Organize Training Data\n", "\n", "As described in the original paper, we need to train the encoder and decoder in one model.\n", "We can organize the training data using templates below:" ] }, { "cell_type": "code", "execution_count": 47, "id": "9292ad25-12c7-418a-9e77-b433b95f57ac", "metadata": {}, "outputs": [], "source": [ "train_data = []\n", "\n", "encoder_prompt = Template(\"\"\"Disrupt the main words in the original text so that it becomes difficult to recognize, but at the same time, try to 
maintain the original meaning as much as possible. Use to end your reply.\n", "Origin Doc: \n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "\n", "Perturbed Doc:\n", "\"\"\")\n", "\n", "encoder_out = Template(\"\"\"\n", "Question:{{p_question}}\n", "Choices:{{p_choice}}\n", "\"\"\")\n", "\n", "decoder_in = Template(\"\"\"This is a perturbed question and its corresponding answer(rationale). And following is the original question. Try to recover the correct rationale from docs provided.\n", "\n", "Perturbed doc and rationale:\n", "Question:{{p_question}}\n", "Choices:{{p_choice}}\n", "Rationale:{{p_rationale}}\n", "\n", "Original Doc:\n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "\n", "Recover Rationale:\n", "\"\"\")\n", "\n", "decoder_out = Template(\"\"\"{{rationale}}\"\"\")\n", "\n", "\n", "for i in ds:\n", " a = {}\n", " a['encoder_in'] = encoder_prompt.render(i)\n", " a['encoder_out'] = encoder_out.render(i)\n", " a['decoder_in'] = decoder_in.render(i)\n", " a['decoder_out'] = decoder_out.render(i)\n", " train_data.append(a)\n", "\n", "import torch\n", "torch.save(train_data, './slm_ed_train_data.pkl')" ] }, { "cell_type": "markdown", "id": "dd73db44-4e73-4c1e-8f27-755522587636", "metadata": {}, "source": [ "## Train Script\n", "\n", "The key step: preparing data is now done. Then we can train a SLM model using the train data. You can use following dataset&trainer class to train an encoder-decoder slm model. Here we use Qwen-0.5B as the example." ] }, { "cell_type": "code", "execution_count": 51, "id": "eb01c591-3c04-4317-8bb0-f55846fb1b66", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForCausalLM, AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 52, "id": "f0da4e10-af80-4216-8ff8-5816dabc8526", "metadata": {}, "outputs": [], "source": [ "model = AutoModelForCausalLM.from_pretrained('/data/cephfs/llm/models/Qwen1.5-0.5B/').half().cuda()" ] }, { "cell_type": "code", "execution_count": 75, "id": "634fc973-29c8-499e-a99e-d50b7ee54124", "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import Dataset\n", "\n", "class EDDataset(Dataset):\n", "\n", " def __init__(self, tokenizer, train_data, max_input_length=64, max_target_length=64):\n", " self.tokenizer = tokenizer\n", " self.dataset = train_data\n", " self.max_input_length = max_input_length\n", " self.max_target_length = max_target_length\n", " self.max_seq_length = max_input_length + max_target_length + 1\n", "\n", " def get_str_item(self, i) -> dict:\n", "\n", " data_item = self.dataset[i]\n", " ret_dict = {\n", " 'encoder':{\n", " 'input': data_item['encoder_in'],\n", " 'output': data_item['encoder_out']\n", " },\n", " 'decoder':{\n", " 'input': data_item['decoder_in'],\n", " 'output': data_item['decoder_out']\n", " }\n", " }\n", " return ret_dict\n", "\n", " def _process_item(self, data_item):\n", "\n", " a_ids = self.tokenizer.encode(text=data_item['input'], add_special_tokens=True, truncation=True,\n", " max_length=self.max_input_length)\n", " b_ids = self.tokenizer.encode(text=data_item['output'], add_special_tokens=False, truncation=True,\n", " max_length=self.max_target_length)\n", " context_length = len(a_ids)\n", " input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]\n", " labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]\n", " pad_len = self.max_seq_length - len(input_ids)\n", " input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len\n", " labels = labels + 
[self.tokenizer.pad_token_id] * pad_len\n", " # replace padding ids with -100 so they are ignored by the loss\n", " labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]\n", "\n", " assert len(input_ids) == len(labels), f\"length mismatch: {len(input_ids)} vs {len(labels)}\"\n", "\n", " return {\n", " \"input_ids\": input_ids,\n", " \"labels\": labels\n", " }\n", "\n", " def get_tokenized_item(self, i) -> dict: \n", "\n", " str_item = self.get_str_item(i)\n", " ret_dict = {\n", " 'encoder': self._process_item(str_item['encoder']),\n", " 'decoder': self._process_item(str_item['decoder'])\n", " }\n", " return ret_dict\n", "\n", " def __getitem__(self, i) -> dict:\n", " item = self.get_tokenized_item(i)\n", " return item" ] }, { "cell_type": "code", "execution_count": 76, "id": "5f914b1f-cf14-4bdc-acc9-ae1b73cf857c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] } ], "source": [ "train_ds = EDDataset(AutoTokenizer.from_pretrained('/data/cephfs/llm/models/Qwen1.5-0.5B/'), train_data)" ] }, { "cell_type": "code", "execution_count": null, "id": "817084b2-2439-45d8-aa1b-da0b1a8a2846", "metadata": {}, "outputs": [], "source": [ "print(train_ds.get_str_item(0))\n", "print(train_ds[0])" ] }, { "cell_type": "code", "execution_count": 82, "id": "303bcb23-d54b-4375-bad2-bf5450c14f28", "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.fedcot.slm_encoder_decoder_trainer import EncoderDecoderPrefixTrainer, EDPrefixDataCollator" ] }, { "cell_type": "markdown", "id": "aa5a0b4f-cd03-4867-8753-fc5bcb036c69", "metadata": {}, "source": [ "After completing the setup, you can utilize the EncoderDecoderPrefixTrainer, EDPrefixDataCollator, and the training dataset to train an SLM encoder-decoder model following the Huggingface approach! " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: doc/tutorial/fedcot/fedcot_tutorial.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "9234355d-389f-484f-9fc2-7b17563b3390", "metadata": {}, "source": [ "# FedCoT Tutorial\n", "\n", "## Introduction to FedCoT\n", "\n", "FedCoT (Federated Chain-of-Thought) is a novel framework designed to distill knowledge from large language models (LLMs) to small language models (SLMs) while ensuring data privacy. The framework addresses two major challenges faced by LLM deployment in real-world applications: the privacy of domain-specific knowledge and resource constraints.\n", "\n", "FedCoT adopts a server-client architecture where the client sends perturbed prompts to the server-side LLM for inference, generating perturbed rationales. 
The client then decodes these rationales and uses them to enrich the training of its task-specific SLM, ultimately enhancing its performance.\n", "\n", "FedCoT introduces two privacy protection strategies: \n", "- **the Exponential Mechanism Strategy**\n", "- **the Encoder-Decoder Strategy**\n", " \n", "The Exponential Mechanism Strategy utilizes a DP (differential privacy) based exponential mechanism to obfuscate user prompts, while the Encoder-Decoder Strategy employs a specialized Encoder-Decoder SLM to encode and decode perturbed prompts and rationales. These strategies effectively balance user privacy and the usability of rationales, allowing for secure and enhanced training of the client's SLM without compromising on privacy concerns.\n", "\n", "Through experiments on various text generation tasks, FedCoT demonstrates its effectiveness in training task-specific SLMs with enhanced performance, significantly improving the SLM's capabilities while prioritizing data privacy protection. For more details, please refer to the paper: [FedCoT: Federated Chain-of-Thought Distillation for Large Language Models](https://arxiv.org/pdf/2406.12403).\n", "\n", "**Before reading this tutorial, we strongly recommend that you first read [the InferDPT](./) tutorial.**\n", "\n", "## Use the Infer Client & Server\n", "\n", "In this section, we are going to introduce the inference part, which is the key part of FedCoT that generates useful rationales in a privacy-preserving manner. You can use InferDPT (which utilizes the Exponential Mechanism Strategy) or a specifically trained SLM as the text encoder & decoder. Below, we retrieve a sample from the arc-easy dataset as an example:" ] }, { "cell_type": "code", "execution_count": 10, "id": "c443c920-31ff-446a-801f-d7a02409a8c0", "metadata": {}, "outputs": [], "source": [ "test_example = {'id': 'Mercury_7220990',\n", "'question': 'Which factor will most likely cause a person to develop a fever?',\n", "'choices': {'text': ['a leg muscle relaxing after exercise',\n", "'a bacterial population in the bloodstream',\n", "'several viral particles on the skin',\n", "'carbohydrates being digested in the stomach'],\n", "'label': ['A', 'B', 'C', 'D']},\n", "'answerKey': 'B'}" ] }, { "cell_type": "markdown", "id": "46646b18-46bb-476d-8b1d-1ef661446929", "metadata": {}, "source": [ "### Fate Context\n", "\n", "We need to create a FATE context to enable the communication between client and server. Then, we can initialize the infer client (which encodes the raw prompt and decodes the perturbed response) and the server (which deploys the LLM) to enable secure inference."
] }, { "cell_type": "code", "execution_count": 6, "id": "0cc8e8f8-88d7-45ab-a988-5ead06356418", "metadata": { "tags": [] }, "outputs": [], "source": [ "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))" ] }, { "cell_type": "markdown", "id": "c75dbcda-1a40-421d-ab1b-92eca5600866", "metadata": {}, "source": [ "### The DP based Strategy(InferDPT)\n", "\n", "As outlined in the [InferDPT tutorial](./), you can initialize the InferDPT client and server to facilitate secure and private inference. Prior to executing the InferDPT component, it is recommended to generate the InferDPT kit by following the step-by-step instructions provided in the tutorial.\n", "\n", "#### Client-Side Code\n", "\n", "On the client side, we load the pre-computed inferdpt-kit and deploy a local SLM as the decoding model." ] }, { "cell_type": "code", "execution_count": null, "id": "ff0f317f-414f-4b9f-84e6-b992b31350cb", "metadata": {}, "outputs": [], "source": [ "from fate_llm.inference.api import APICompletionInference\n", "from fate_llm.algo.inferdpt import inferdpt\n", "from fate_llm.algo.inferdpt.utils import InferDPTKit\n", "import sys\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "ctx = create_ctx(guest)\n", "save_kit_path = 'your path'\n", "kit = InferDPTKit.load_from_path(save_kit_path)\n", "# local deployed small model as decoding model\n", "inference = APICompletionInference(api_url=\"http://127.0.0.1:8887/v1\", model_name='./Qwen1.5-0.5B', api_key='EMPTY')\n", "\n", "test_example = {'id': 'Mercury_7220990',\n", "'question': 'Which factor will most likely cause a person to develop a fever?',\n", "'choices': {'text': ['a leg muscle relaxing after exercise',\n", "'a bacterial population in the bloodstream',\n", "'several viral particles on the skin',\n", "'carbohydrates being digested in the stomach'],\n", "'label': ['A', 
'B', 'C', 'D']},\n", "'answerKey': 'B'}\n", "\n", "\n", "doc_template = \"\"\"{{question}} \n", "Choices:{{choices.text}}\n", "\"\"\"\n", "\n", "instruction_template=\"\"\"\n", "[INST]\n", "Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'.\n", "\n", "Please explain:\n", "Question:{{perturbed_doc}}\n", "Rationale:\n", "[/INST]\n", "\"\"\"\n", "\n", "decode_template = \"\"\"Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'.\n", "\n", "Question:{{perturbed_doc}}\n", "Rationale:{{perturbed_response | replace('\\n', '')}}\n", "\n", "Please explain:\n", "Question:{{question}} \n", "Choices:{{choices.text}}\n", "Rationale:\n", "\"\"\"\n", "\n", "inferdpt_client = inferdpt.InferDPTClient(ctx, kit, inference, epsilon=3.0)\n", "result = inferdpt_client.inference([test_example], doc_template, instruction_template, decode_template, \\\n", " remote_inference_kwargs={\n", " 'stop': ['<\\s>'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " },\n", " local_inference_kwargs={\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " })\n", "print('result is {}'.format(result[0]['inferdpt_result']))" ] }, { "cell_type": "markdown", "id": "96fbcb01-6907-432f-8393-ae1746559c3a", "metadata": {}, "source": [ "#### Server Side Code" ] }, { "cell_type": "code", "execution_count": 9, "id": "960a476c-50a5-40fb-847d-02101cea27ae", "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.inferdpt.inferdpt import InferDPTServer\n", "import sys\n", "from fate_llm.inference.api import APICompletionInference\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - 
%(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "ctx = create_ctx(arbiter)\n", "# Api to a LLM\n", "inference_server = APICompletionInference(api_url=\"http://127.0.0.1:8888/v1\", model_name='./Mistral-7B-Instruct-v0.2', api_key='EMPTY')\n", "inferdpt_server = InferDPTServer(ctx, inference_server)\n", "inferdpt_server.inference()" ] }, { "cell_type": "markdown", "id": "16f908a7-9187-461a-93db-9945456d502d", "metadata": {}, "source": [ "Start two terminal and launch client&server scripts simultaneously. On the client side we can get the answer:\n", "\n", "```\n", "The given question asks which factor will most likely cause a person to develop a fever. The factors mentioned are a leg muscle relaxing after exercise, a bacterial population in the bloodstream, several viral particles on the skin, and carbohydrates being digested in the stomach. The question is asking which factor is most likely to cause a person to develop a fever. The factors are all related to the body's internal environment, but the most likely factor is a bacterial population in the bloodstream. This is because bacteria can cause a fever, and the body's immune system responds to the infection by producing antibodies that can fight off the bacteria. Therefore, the answer is 'a bacterial population in the bloodstream'\n", "```" ] }, { "cell_type": "markdown", "id": "fb36a485-2fa8-4629-a2cf-2d53fdbbcc5f", "metadata": {}, "source": [ "### The Encoder-Decoder Model Strategy\n", "\n", "Similar to the InferDPT, we can initialize SLMEncoderDecoderClient and SLMEncoderDecoderServer to enable secure inference.\n", "The client will encode the raw prompt using local slm model and then decoded it with the same model\n", "\n", "#### Client Side Code" ] }, { "cell_type": "code", "execution_count": 4, "id": "cd174244-8640-4cb2-8609-ac6468f5a6f5", "metadata": { "tags": [] }, "outputs": [], "source": [ "from fate_llm.inference.api import APICompletionInference\n", "from fate_llm.algo.fedcot.encoder_decoder.slm_encoder_decoder import SLMEncoderDecoderClient\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "\n", "test_example = {'id': 'Mercury_7220990',\n", "'question': 'Which factor will most likely cause a person to develop a fever?',\n", "'choices': {'text': ['a leg muscle relaxing after exercise',\n", "'a bacterial population in the bloodstream',\n", "'several viral particles on the skin',\n", "'carbohydrates being digested in the 
stomach'],\n", "'label': ['A', 'B', 'C', 'D']},\n", "'answerKey': 'B'\n", "}\n", "\n", "\n", "encode_prompt = \"\"\"Disrupt the main words in the original text so that it becomes difficult to recognize, but at the same time, try to maintain the original meaning as much as possible. Use to end your reply.\n", "Origin Doc:Question:{{question}}\n", "Choices:{{choices.text}}\n", "Perturb Doc: \n", "\"\"\"\n", "\n", "decode_prompt = \"\"\"This is a perturbed question and its corresponding answer(rationale). And following is the original question. Try to recover the correct rationale from docs provided.\n", "\n", "Perturbed doc and rationale:\n", "{{perturbed_doc}}\n", "Rationale:{{perturbed_response}}\n", "\n", "Original Doc:\n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "\n", "Recover Rationale:\n", "\"\"\"\n", "\n", "instruction_template = \"\"\"<|im_start|>system\n", "You are a helpful assistant<|im_end|>\n", "<|im_start|>user\n", "Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\n", "\n", "Example(s):\n", "Question:Which factor will most likely cause a person to develop a fever?\n", "Choices:['a leg muscle relaxing after exercise', 'a bacterial population in the bloodstream', 'several viral particles on the skin', 'carbohydrates being digested in the stomach']\n", "Rationale:A bacterial infection in the bloodstream triggers the immune system to respond, therefore often causing a fever as the body tries to fight off the bacteria. Therefore, the answer is 'a bacterial population in the bloodstream'\n", "\n", "Please explain:\n", "{{perturbed_doc}}\n", "Rationale:\n", "<|im_end|>\n", "<|im_start|>assistant\n", "\"\"\"\n", "\n", "ctx = create_ctx(guest)\n", "model_name = 'Deploy your encoder decoder model'\n", "# api_url to your locally deployed encoder decoder\n", "api = APICompletionInference(api_url='http://127.0.0.1:8887/v1', api_key='EMPTY', model_name=model_name)\n", "client = SLMEncoderDecoderClient(ctx, api)\n", "result = client.inference([test_example], encode_prompt, instruction_template, decode_prompt, \\\n", " remote_inference_kwargs={\n", " 'stop': ['<\\s>'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " },\n", " local_inference_kwargs={\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " })\n", "print('result is {}'.format(result[0]['inferdpt_result']))" ] }, { "cell_type": "markdown", "id": "1a865536-7814-40a2-a814-d00e46f2787f", "metadata": {}, "source": [ "#### Server Side Code" ] }, { "cell_type": "code", "execution_count": 7, "id": "cced44b0-0dcb-4427-8efe-a04135b246ac", "metadata": {}, "outputs": [], "source": [ "from fate_llm.inference.api import APICompletionInference\n", "from fate_llm.algo.fedcot.encoder_decoder.slm_encoder_decoder import SLMEncoderDecoderServer\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = 
logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "ctx = create_ctx(arbiter)\n", "# api url&name are depolyed LLM\n", "model_name = '/data/cephfs/llm/models/Qwen1.5-14B-Chat/'\n", "api = APICompletionInference(api_url='http://127.0.0.1:8888/v1', api_key='EMPTY', model_name=model_name)\n", "server = SLMEncoderDecoderServer(ctx, api)\n", "server.inference()" ] }, { "cell_type": "markdown", "id": "c38ed7a6-2eb2-4f46-b59c-eaafcc9a5b7a", "metadata": {}, "source": [ "Start two terminal and launch client&server scripts simultaneously. On the client side we can get the answer:\n", "\n", "```\n", "A fever is typically caused by a bacterial population in the bloodstream, as it is a response to an infection. So the answer is 'a bacterial population in the bloodstream'.\n", "```" ] }, { "cell_type": "markdown", "id": "41fbbefd-e931-4e95-9d28-9675ff7865a3", "metadata": {}, "source": [ "## Prefix Dataset & FedCoT Trainer\n", "\n", "Now that we can carry out privacy-preserving inference and acquire rationales, the next step is to train a new task-specific model, enhanced by the rationales generated by the LLMs.\n", "\n", "In this section, we will introduce the PrefixDataset and FedCoTTrainer, which facilitate training tasks with the added benefit of supplementary rationales. The PrefixDataset allows you to assign various text prefixes, guiding the model to produce different text targets. With FedCoTTrainer, the model is trained to generate both text labels and text rationales at each update step, ultimately leading to superior performance compared to training on the raw dataset alone.\n", "\n", "### Prepare dataset\n", "In this tutorial, we will use the arc-easy dataset." ] }, { "cell_type": "code", "execution_count": null, "id": "e25377d0-1a7e-4e8c-aa9f-3bcb03ae0c45", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "dataset = load_dataset(\"arc_easy\")\n", "dataset.save_to_disk('path_to_save/arce')" ] }, { "cell_type": "markdown", "id": "9166110f-bf67-4bf1-9da8-04c16bd79423", "metadata": {}, "source": [ "Let’s proceed with testing the PrefixDataset. We can utilize Jinja2 templates to structure the text and append prefixes or suffixes to our training data.\n", "\n", "Please note that at this stage, the dataset does not contain rationales. In the 'rationale_output_template', the key used for the inference results is ‘infer_result’. We can perform secure inference using the FedCoTTrainer and then integrate the rationale results, keyed as ‘infer_result’, into the PrefixDataset." 
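, "\n", "The integration itself is handled by the trainer, but conceptually it simply attaches each generated rationale to its raw example under the `infer_result` key, so that the rationale_output_template (`{{infer_result}}`) has something to render. A minimal, hypothetical sketch (not the exact FATE-LLM internals):\n", "\n", "```python\n", "# hypothetical names: raw_examples is a list of example dicts, rationales the LLM outputs\n", "for example, rationale in zip(raw_examples, rationales):\n", "    example['infer_result'] = rationale\n", "```"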
] }, { "cell_type": "code", "execution_count": 17, "id": "fdbd93d6-45f3-404f-813e-9ca1fd6def04", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] } ], "source": [ "from fate_llm.dataset.fedcot_dataset import PrefixDataset\n", "\n", "pds = PrefixDataset(\n", " tokenizer_path='/data/cephfs/llm/models/Qwen1.5-0.5B/',\n", " predict_input_template=\"\"\"Predict:\n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "Answer:\n", " \"\"\",\n", " predict_output_template=\"\"\"{{choices.text[choices.label.index(answerKey)]}}\"\"\",\n", " rationale_input_template=\"\"\"Explain:\n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "Rationale:\n", " \"\"\",\n", " rationale_output_template=\"\"\"{{infer_result}}\"\"\",\n", " max_input_length=128,\n", " max_target_length=128,\n", " split_key='train'\n", " )\n", "\n", "\n", "pds.load('path_to_save/arce')" ] }, { "cell_type": "code", "execution_count": 27, "id": "100eeb69-8bd2-4e66-b1cc-667f95e47f23", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 'Mercury_7220990',\n", " 'question': 'Which factor will most likely cause a person to develop a fever?',\n", " 'choices': {'text': ['a leg muscle relaxing after exercise',\n", " 'a bacterial population in the bloodstream',\n", " 'several viral particles on the skin',\n", " 'carbohydrates being digested in the stomach'],\n", " 'label': ['A', 'B', 'C', 'D']},\n", " 'answerKey': 'B'}" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pds.dataset[0] # the structure is the same as hf dataset" ] }, { "cell_type": "code", "execution_count": 21, "id": "6f0356ef-f94b-41db-ab66-b1d0eb862eca", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'predict': {'input': \"Predict:\\nQuestion:Which factor will most likely cause a person to develop a fever?\\nChoices:['a leg muscle relaxing after exercise', 'a bacterial population in the bloodstream', 'several viral particles on the skin', 'carbohydrates being digested in the stomach']\\nAnswer:\\n \",\n", " 'output': 'a bacterial population in the bloodstream'},\n", " 'rationale': {'input': \"Explain:\\nQuestion:Which factor will most likely cause a person to develop a fever?\\nChoices:['a leg muscle relaxing after exercise', 'a bacterial population in the bloodstream', 'several viral particles on the skin', 'carbohydrates being digested in the stomach']\\nRationale:\\n \",\n", " 'output': '\\n '}}" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pds.get_str_item(0) # we can see that the output of rationale term is empty" ] }, { "cell_type": "code", "execution_count": 25, "id": "6a227af7-f24a-46bd-9af7-78584a381b33", "metadata": {}, "outputs": [], "source": [ "print(pds[0]) # show tokenized, for the sake of breif we dont show it in this tutorial doc" ] }, { "cell_type": "markdown", "id": "e0382a33-7a45-43a3-8ed3-58ed1d1b07d8", "metadata": {}, "source": [ "### The FedCoTTrainer\n", "\n", "Here we introduce the FedCoTTrainer which is develop based on Huggingface trainer and supports collaboratively training a task with raw labels and additional rationales. 
Here is how the compute loss function is realized:" ] }, { "cell_type": "code", "execution_count": null, "id": "b40b7d99-9ef8-43f9-8e28-db96d96af62a", "metadata": {}, "outputs": [], "source": [ "def compute_loss(self, model, inputs, return_outputs=False):\n", "\n", " label_outputs = model(**inputs['predict'])\n", " cot_outputs = model(**inputs['rationale'])\n", " # alpha weights the rationale (CoT) loss against the label prediction loss\n", " loss = self.alpha * cot_outputs.loss + (1. - self.alpha) * label_outputs.loss\n", " return (loss, {'rationale_loss': cot_outputs, 'predict_loss': label_outputs}) if return_outputs else loss" ] }, { "cell_type": "markdown", "id": "ff1cee5d-68e1-4caf-96b9-132b27b46dca", "metadata": {}, "source": [ "You have the option to choose from three distinct modes: ‘infer_only’, ‘train_only’, and ‘infer_and_train’, to meet your specific requirements.\n", "- infer_only: Only generate the rationales; they will be saved to the output_dir\n", "- train_only: Local training only\n", "- infer_and_train: Generate rationales, then load them into PrefixDataset and start training\n", " \n", "In this instance, we will opt for the ‘infer_and_train’ mode to initially generate rationales with the assistance of the remote LLM. To activate the inference process, it is necessary to initialize the infer client and server for both the client-side and server-side trainers, as demonstrated in the preceding sections.\n", "\n", "Below is a FedCoT example. We ran this example on a machine equipped with 4 V100-32G GPUs. We launch the client script using deepspeed. The LLM is deployed on another machine." ] }, { "cell_type": "markdown", "id": "c559341a-d133-4a24-8f1a-35cd6d2a26d3", "metadata": {}, "source": [ "## FedCoT Example\n", "\n", "### Client Script (deepspeed_run.py)\n", "\n", "This script shows how to set up a FedCoT task on the client side."
] }, { "cell_type": "code", "execution_count": null, "id": "e4710fda-904a-4e90-bc65-beec7594703f", "metadata": {}, "outputs": [], "source": [ "import logging\n", "import os\n", "import sys\n", "from transformers import (\n", " AutoTokenizer,\n", " HfArgumentParser,\n", " Seq2SeqTrainingArguments,\n", ")\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "from typing import List\n", "from fate_llm.algo.inferdpt.utils import InferDPTKit\n", "from fate_llm.dataset.fedcot_dataset import PrefixDataset\n", "from fate_llm.algo.fedcot.fedcot_trainer import FedCoTTrainerClient\n", "from fate_llm.data.data_collator.fedcot_collator import PrefixDataCollator\n", "from fate_llm.algo.inferdpt import inferdpt\n", "\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "\n", "logger = logging.getLogger(__name__)\n", "\n", "\n", "doc_template = \"\"\"{{question}} \n", "Choices:{{choices.text}}\n", "\"\"\"\n", "\n", "instruction_template=\"\"\"\n", "[INST]\n", "Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'.\n", "\n", "Please explain:\n", "Question:{{perturbed_doc}}\n", "Rationale:\n", "[/INST]\n", "\"\"\"\n", "\n", "decode_template = \"\"\"Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. 
Therefore, the answer is 'dry palms'.\n", "\n", "Question:{{perturbed_doc}}\n", "Rationale:{{perturbed_response | replace('\\n', '')}}\n", "\n", "Please explain:\n", "Question:{{question}} \n", "Choices:{{choices.text}}\n", "Rationale:\n", "\"\"\"\n", " \n", "\n", "if __name__ == \"__main__\":\n", " \n", " parser = HfArgumentParser(Seq2SeqTrainingArguments)\n", " if len(sys.argv) == 2 and sys.argv[1].endswith(\".json\"):\n", " training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]\n", " else:\n", " training_args = parser.parse_args_into_dataclasses()[0]\n", "\n", " model_path = '/data/cephfs/llm/models/Qwen1.5-0.5B/'\n", " pds = PrefixDataset(\n", " tokenizer_path=model_path,\n", " predict_input_template=\"\"\"Predict:\n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "Answer:\n", " \"\"\",\n", " predict_output_template=\"\"\"{{choices.text[choices.label.index(answerKey)]}}\"\"\",\n", " rationale_input_template=\"\"\"Explain:\n", "Question:{{question}}\n", "Choices:{{choices.text}}\n", "Rationale:\n", " \"\"\",\n", " rationale_output_template=\"\"\"{{infer_result}}\n", " \"\"\",\n", " max_input_length=128,\n", " max_target_length=128,\n", " split_key='train'\n", " )\n", " pds.load('/data/cephfs/llm/datasets/arce/')\n", " \n", " model = AutoModelForCausalLM.from_pretrained(model_path).half().cuda()\n", " tokenizer = AutoTokenizer.from_pretrained(model_path)\n", " model.gradient_checkpointing_enable()\n", " model.enable_input_require_grads()\n", "\n", " ctx = create_ctx(guest)\n", " if training_args.local_rank == 0:\n", " # only rank 0 need to load infer instance\n", " save_kit_path = 'your path'\n", " kit = InferDPTKit.load_from_path(save_kit_path)\n", " # local deployed small model as decoding model\n", " from fate_llm.algo.inferdpt.inference.api import APICompletionInference\n", " inference = APICompletionInference(api_url=\"http://xxxx/v1\", model_name='./Qwen1.5-0.5B', api_key='EMPTY')\n", " client = inferdpt.InferDPTClient(ctx, kit, inference, epsilon=3.0)\n", " else:\n", " client = None\n", " \n", " trainer = FedCoTTrainerClient(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " tokenizer=tokenizer, \n", " train_set=pds,\n", " data_collator=PrefixDataCollator(tokenizer),\n", " mode='infer_and_train',\n", " infer_client=client,\n", " encode_template=doc_template,\n", " decode_template=decode_template,\n", " instruction_template=instruction_template,\n", " remote_inference_kwargs={\n", " 'stop': ['<\\s>'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " },\n", " local_inference_kwargs={\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " }\n", " )\n", "\n", " trainer.train()\n", "\n", " if training_args.local_rank == 0:\n", " model.save_pretrained(training_args.output_dir)\n", " tokenizer.save_pretrained(training_args.output_dir)" ] }, { "cell_type": "markdown", "id": "962dd399-1dec-4164-bd86-15aa8550c50b", "metadata": {}, "source": [ "### Server Script(server.py)\n", "\n", "This script show how to setup a fedcot task on the server side." 
] }, { "cell_type": "code", "execution_count": null, "id": "91b42972-5308-4ccf-a768-f7dfa087313e", "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.inferdpt.inferdpt import InferDPTServer\n", "from fate_llm.algo.fedcot.fedcot_trainer import FedCoTTraineServer\n", "import sys\n", "\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "\n", "from fate_llm.algo.inferdpt.inference.api import APICompletionInference\n", "api = APICompletionInference(api_url='http://xxxx:8080/v1', api_key='EMPTY', model_name='/data/cephfs/llm/models/Qwen1.5-14B-Chat')\n", "\n", "ctx = create_ctx(arbiter)\n", "server_api = InferDPTServer(ctx, api)\n", "server = FedCoTTraineServer(ctx, server_api)\n", "server.train()" ] }, { "cell_type": "markdown", "id": "125dd68e-c7d4-41aa-9972-4881b1330fb6", "metadata": {}, "source": [ "### Start script\n", "\n", "You can launch client side training with following script:\n", "\n", "```\n", "deepspeed --num_nodes 1 --num_gpus 4 deepspeed_run.py \\\n", " --output_dir \"./\" \\\n", " --per_device_train_batch_size \"1\" \\\n", " --gradient_accumulation_steps \"8\" \\\n", " --max_steps \"750\" \\\n", " --fp16 \\\n", " --logging_steps 10 \\\n", " --save_only_model \\\n", " --deepspeed \"./ds_config.json\" \n", "```" ] }, { "cell_type": "markdown", "id": "0b506c1c-51f4-448d-9b0b-adf1a71cc7cf", "metadata": {}, "source": [ "and the ds_config.json is\n", "```\n", "{ \n", " \"train_micro_batch_size_per_gpu\": 1,\n", " \"gradient_accumulation_steps\": 8,\n", " \"optimizer\": {\n", " \"type\": \"AdamW\",\n", " \"params\": {\n", " \"lr\": 5e-5\n", " }\n", " },\n", " \"fp16\": {\n", " \"enabled\": true\n", " },\n", " \"zero_optimization\": {\n", " \"stage\": 0\n", " }\n", "}\n", "```" ] }, { "cell_type": "markdown", "id": "613fbfb6-ac9e-485b-8587-ffef1e2361c1", "metadata": {}, "source": [ "And server side:" ] }, { "cell_type": "markdown", "id": "5b50adf0-8f9c-40e5-9a7d-40a70e30a420", "metadata": {}, "source": [ "```python server.py```" ] }, { "cell_type": "markdown", "id": "28a5de71-25fd-4042-a6b7-0ec2c505eaee", "metadata": {}, "source": [ "## FedCoT Pipeline Example\n", "\n", "You have the capability to submit a FedCoT task within the FATE pipeline. By appropriately configuring the necessary settings, you can execute FedCoT in a production environment." 
] }, { "cell_type": "code", "execution_count": null, "id": "52f1e19b-da8e-4977-adb1-42fb84dee407", "metadata": {}, "outputs": [], "source": [ "from fate_client.pipeline.components.fate.nn.loader import Loader\n", "import argparse\n", "from fate_client.pipeline.utils import test_utils\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline import FateFlowPipeline\n", "\n", "\n", "def main(config=\"../../config.yaml\", namespace=\"\"):\n", " # obtain config\n", " if isinstance(config, str):\n", " config = test_utils.load_job_config(config)\n", " parties = config.parties\n", " guest = '9999'\n", " host = parties.host[0]\n", " arbiter = '10000'\n", "\n", " pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", "\n", " reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest))\n", " reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"arc_e_example\"\n", " )\n", "\n", " model_conf = Loader(module_name='fate_llm.model_zoo.hf_model', item_name='HFAutoModelForCausalLM', \n", " pretrained_model_name_or_path='/data/cephfs/llm/models/Qwen1.5-0.5B/').to_dict()\n", " data_collator_conf = Loader(module_name='fate_llm.data.data_collator.fedcot_collator', item_name='get_prefix_data_collator', tokenizer_name_or_path='/data/cephfs/llm/models/Qwen1.5-0.5B/').to_dict()\n", "\n", " infer_init_conf_client = {\n", " 'module_name': 'fate_llm.algo.inferdpt.init.default_init',\n", " 'item_name': 'InferDPTAPIClientInit'\n", " }\n", "\n", " infer_init_conf_server = {\n", " 'module_name': 'fate_llm.algo.inferdpt.init.default_init',\n", " 'item_name': 'InferDPTAPIServerInit'\n", " }\n", "\n", " dataset_conf = {\n", " 'module_name': 'fate_llm.dataset.fedcot_dataset',\n", " 'item_name': 'PrefixDataset',\n", " 'kwargs':dict(\n", " tokenizer_path='/data/cephfs/llm/models/Qwen1.5-0.5B/',\n", " predict_input_template=\"\"\"Predict:\n", " Question:{{question}}\n", " Choices:{{choices.text}}\n", " \"\"\",\n", " predict_output_template=\"\"\"{{choices.text[choices.label.index(answerKey)]}}\"\"\",\n", " rationale_input_template=\"\"\"Explain:\n", " Question:{{question}}\n", " Choices:{{choices.text}}\n", " \"\"\",\n", " rationale_output_template=\"\"\"{{infer_result}}\n", " \"\"\",\n", " max_input_length=128,\n", " max_target_length=128,\n", " split_key='train'\n", " )\n", " }\n", "\n", " encoder_prompt = \"\"\"{{question}}\n", "Choices:{{choices.text}}\n", "\"\"\"\n", "\n", " decoder_prompt = \"\"\"Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.Use to finish your rationle.\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. 
Therefore, the answer is 'dry palms'.\n", "\n", "Question:{{perturbed_doc}}\n", "Rationale:{{perturbed_response | replace('\\n', '')}}\n", "\n", "Please explain:\n", "Question:{{question}} \n", "Choices:{{choices.text}}\n", " \"\"\"\n", "\n", " instruction_prompt = \"\"\"<|im_start|>system\n", "You are a helpful assistant<|im_end|>\n", "<|im_start|>user\n", "Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\n", "\n", "Example(s):\n", "Question:Which factor will most likely cause a person to develop a fever?\n", "Choices:['a leg muscle relaxing after exercise', 'a bacterial population in the bloodstream', 'several viral particles on the skin', 'carbohydrates being digested in the stomach']\n", "Rationale:A bacterial infection in the bloodstream triggers the immune system to respond, therefore often causing a fever as the body tries to fight off the bacteria. Therefore, the answer is 'a bacterial population in the bloodstream'\n", "\n", "Please explain:\n", "Question:{{perturbed_doc}}\n", "Rationale:\n", "<|im_end|>\n", "<|im_start|>assistant\n", " \"\"\"\n", "\n", " remote_inference_kwargs={\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " }\n", "\n", " local_inference_kwargs={\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " }\n", "\n", " ds_config = { \n", " \"train_micro_batch_size_per_gpu\": 1,\n", " \"gradient_accumulation_steps\": 8,\n", " \"optimizer\": {\n", " \"type\": \"AdamW\",\n", " \"params\": {\n", " \"lr\": 5e-5\n", " }\n", " },\n", " \"fp16\": {\n", " \"enabled\": True\n", " },\n", " \"zero_optimization\": {\n", " \"stage\": 0\n", " }\n", " }\n", "\n", " training_args_dict = dict(\n", " per_device_train_batch_size=1, \n", " gradient_accumulation_steps=8,\n", " logging_steps=10,\n", " max_steps=30,\n", " fp16=True,\n", " log_level='debug'\n", " )\n", "\n", " mode = 'infer_and_train'\n", "\n", " client_conf = dict(\n", " model_conf=model_conf,\n", " dataset_conf=dataset_conf,\n", " training_args_conf=training_args_dict,\n", " data_collator_conf=data_collator_conf,\n", " mode=mode,\n", " infer_inst_init_conf=infer_init_conf_client,\n", " encode_template=encoder_prompt,\n", " instruction_template=instruction_prompt,\n", " decode_template=decoder_prompt,\n", " remote_inference_kwargs=remote_inference_kwargs,\n", " local_inference_kwargs=local_inference_kwargs,\n", " perturb_doc_key='perturbed_doc',\n", " perturbed_response_key='perturbed_response',\n", " result_key='infer_result'\n", " )\n", "\n", " server_conf = dict(\n", " infer_inst_init_conf=infer_init_conf_server,\n", " mode=mode\n", " )\n", "\n", " homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"fedcot_runner\",\n", " runner_class=\"FedCoTRunner\"\n", " )\n", "\n", " homo_nn_0.guest.task_parameters(runner_conf=client_conf)\n", " homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)\n", "\n", " homo_nn_0.guest.conf.set(\"launcher_name\", \"deepspeed\")\n", "\n", " pipeline.add_tasks([reader_0, homo_nn_0])\n", " pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 4}))\n", " pipeline.compile()\n", " pipeline.fit()\n", "\n", "if __name__ == \"__main__\":\n", " parser = argparse.ArgumentParser(\"PIPELINE DEMO\")\n", " 
parser.add_argument(\"--config\", type=str, default=\"../config.yaml\",\n", " help=\"config file\")\n", " parser.add_argument(\"--namespace\", type=str, default=\"\",\n", " help=\"namespace for data stored in FATE\")\n", " args = parser.parse_args()\n", " main(config=args.config, namespace=args.namespace)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: doc/tutorial/fedkseed/README.md ================================================ ## FedKSeed The Algorithm is based on the paper: [Federated Full-Parameter Tuning of Billion-Sized Language Models with Communication Cost under 18 Kilobytes](https://arxiv.org/pdf/2312.06353.pdf) and the code is adaptor from the https://github.com/alibaba/FederatedScope/tree/FedKSeed. We refactor the code to make it more compatible with (transformers/PyTorch) framework and integrate it into the FATE-LLM framework. The main works include: 1. An KSeedZerothOrderOptimizer class that can be used to optimize model along given direction that generated with random seed. 2. An KSeedZOExtendedTrainer subclass of Trainer from transformers that can be used to train large language models with KSeedZerothOrderOptimizer. 3. Trainers for federated learning with large language models. ================================================ FILE: doc/tutorial/fedkseed/fedkseed-example.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Federated Tuning with FedKSeed methods in FATE-LLM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, we will demonstrate how to efficiently train federated large language models using the FATE-LLM framework. In FATE-LLM, we introduce the \"FedKSeed\" module, specifically designed for federated learning with large language models. The Idea of FedKSeed is to use Zeroth-Order-Optimizer to optimize model along given direction that generated with random seed. This method can be used to train large language models in a federated learning setting with extremely low communication cost.\n", "\n", "The Algorithm is based on the paper: [Federated Full-Parameter Tuning of Billion-Sized Language Models\n", "with Communication Cost under 18 Kilobytes](https://arxiv.org/pdf/2312.06353.pdf) and the code is modified from the https://github.com/alibaba/FederatedScope/tree/FedKSeed. We refactor the code to make it more compatible with (transformers/PyTorch) framework and integrate it into the FATE-LLM framework.\n", "\n", "The main works include:\n", "1. An KSeedZerothOrderOptimizer class that can be used to optimize model along given direction that generated with random seed.\n", "2. An KSeedZOExtendedTrainer subclass of Trainer from transformers that can be used to train large language models with KSeedZerothOrderOptimizer.\n", "3. Trainers for federated learning with large language models.\n", "\n", "In this tutorial, we will demonstrate how to use the FedKSeed method to train a large language model in a federated learning setting. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model: datajuicer/LLaMA-1B-dj-refine-150B\n", "\n", "This is the introduction from the Huggingface model hub: [datajuicer/LLaMA-1B-dj-refine-150B](https://huggingface.co/datajuicer/LLaMA-1B-dj-refine-150B)\n", "\n", "> The model architecture is LLaMA-1.3B and we adopt the OpenLLaMA implementation. The model is pre-trained on 150B tokens of Data-Juicer's refined RedPajama and Pile. It achieves an average score of 34.21 over 16 HELM tasks, beating Falcon-1.3B (trained on 350B tokens from RefinedWeb), Pythia-1.4B (trained on 300B tokens from original Pile) and Open-LLaMA-1.3B (trained on 150B tokens from original RedPajama and Pile).\n", "\n", "> For more details, please refer to our [paper](https://arxiv.org/abs/2309.02033).\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2024-02-29T09:27:23.512735Z", "start_time": "2024-02-29T09:27:23.508790Z" }, "collapsed": false }, "outputs": [], "source": [ "# model_name_or_path = \"datajuicer/LLaMA-1B-dj-refine-150B\"\n", "model_name_or_path = \"gpt2\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset: databricks/databricks-dolly-15k\n", "\n", "This is the introduction from the Huggingface dataset hub: [databricks/databricks-dolly-15k](https://huggingface.co/dataset/databricks/databricks-dolly-15k)\n", "\n", "> databricks-dolly-15k is a corpus of more than 15,000 records generated by thousands of Databricks employees to enable large language models to exhibit the magical interactivity of ChatGPT. Databricks employees were invited to create prompt / response pairs in each of eight different instruction categories, including the seven outlined in the InstructGPT paper, as well as an open-ended free-form category. The contributors were instructed to avoid using information from any source on the web with the exception of Wikipedia (for particular subsets of instruction categories), and explicitly instructed to avoid using generative AI in formulating instructions or responses. 
Examples of each behavior were provided to motivate the types of questions and instructions appropriate to each category.\n", "\n", "To use this dataset, you first need to download it from the Huggingface dataset hub:\n", "\n", "```bash\n", "mkdir -p ../../../examples/data/dolly && cd ../../../examples/data/dolly && wget https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl\\?download\\=true -O databricks-dolly-15k.jsonl\n", "```\n", "\n", "### Check Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2024-02-29T09:27:26.987779Z", "start_time": "2024-02-29T09:27:24.706218Z" } }, "outputs": [], "source": [ "from fate_llm.dataset.hf_dataset import Dolly15K\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n", "special_tokens = tokenizer.special_tokens_map\n", "if \"pad_token\" not in tokenizer.special_tokens_map:\n", " special_tokens[\"pad_token\"] = special_tokens[\"eos_token\"]\n", "\n", "tokenizer.pad_token = tokenizer.eos_token\n", "ds = Dolly15K(split=\"train\", tokenizer_params={\"pretrained_model_name_or_path\": model_name_or_path, **special_tokens},\n", " tokenizer_apply_params=dict(truncation=True, max_length=tokenizer.model_max_length, padding=\"max_length\", return_tensors=\"pt\"))\n", "ds = ds.load('../../../examples/data/dolly')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2024-02-29T09:27:27.875025Z", "start_time": "2024-02-29T09:27:27.867839Z" } }, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['instruction', 'context', 'response', 'category', 'text', 'input_ids', 'attention_mask'],\n", " num_rows: 15011\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For more details of FATE-LLM dataset settings, we recommend that you read through these tutorials first: [NN Dataset Customization](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Homo-NN-Customize-your-Dataset.ipynb) and [Some Built-In Dataset](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Introduce-Built-In-Dataset.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check local training\n", "\n", "Before submitting a federated learning task, we will demonstrate how to perform local testing to ensure the proper functionality of your custom dataset and model. 
" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2024-02-29T09:38:33.175079Z", "start_time": "2024-02-29T09:38:33.168844Z" }, "collapsed": false }, "outputs": [], "source": [ "from transformers import AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling\n", "from fate_llm.algo.fedkseed.trainer import KSeedZOExtendedTrainer, KSeedTrainingArguments\n", "from fate_llm.algo.fedkseed.zo_utils import build_seed_candidates, get_even_seed_probabilities\n", "\n", "def test_training(zo_mode=True):\n", " tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path, **special_tokens)\n", " data_collector = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n", " model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n", "\n", " training_args = TrainingArguments(output_dir='./',\n", " dataloader_num_workers=1,\n", " dataloader_prefetch_factor=1,\n", " remove_unused_columns=True,\n", " learning_rate=1e-5,\n", " per_device_train_batch_size=1,\n", " num_train_epochs=0.01,\n", " )\n", " kseed_args = KSeedTrainingArguments(zo_optim=zo_mode)\n", " trainer = KSeedZOExtendedTrainer(model=model, train_dataset=ds, training_args=training_args, kseed_args=kseed_args,\n", " tokenizer=tokenizer, data_collator=data_collector)\n", " if zo_mode:\n", " seed_candidates = build_seed_candidates(k=kseed_args.k)\n", " seed_probabilities = get_even_seed_probabilities(k=kseed_args.k)\n", " trainer.configure_seed_candidates(seed_candidates, seed_probabilities)\n", " return trainer.train()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2024-02-29T09:39:37.602070Z", "start_time": "2024-02-29T09:38:34.024223Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [151/151 00:59, Epoch 0/1]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=151, training_loss=1.2660519429390005, metrics={'train_runtime': 61.8249, 'train_samples_per_second': 2.428, 'train_steps_per_second': 2.442, 'total_flos': 78910193664000.0, 'train_loss': 1.2660519429390005, 'epoch': 0.01})" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_training(zo_mode=True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2024-02-29T09:41:28.949449Z", "start_time": "2024-02-29T09:39:54.802705Z" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [151/151 01:29, Epoch 0/1]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=151, training_loss=0.6093456950408733, metrics={'train_runtime': 92.6158, 'train_samples_per_second': 1.621, 'train_steps_per_second': 1.63, 'total_flos': 78910193664000.0, 'train_loss': 0.6093456950408733, 'epoch': 0.01})" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_training(zo_mode=False)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "You can see that Zeroth-Order-Optimizer has much worse performance than AdamW, that's the price we need to pay for the low communication cost. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Submit Federated Task\n", "Once you have successfully completed local testing, We can submit a task to FATE. Please notice that this tutorial is ran on a standalone version. **Please notice that in this tutorial we are using a standalone version, if you are using a cluster version, you need to bind the data with the corresponding name&namespace on each machine.**\n", "\n", "In this example we load pretrained weights for gpt2 model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline import FateFlowPipeline\n", "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_seq2seq_runner\n", "from fate_client.pipeline.components.fate.nn.algo_params import TrainingArguments, FedAVGArguments\n", "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", "\n", "guest = '10000'\n", "host = '10000'\n", "arbiter = '10000'\n", "\n", "epochs = 0.01\n", "batch_size = 1\n", "lr = 1e-5\n", "\n", "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", "pipeline.bind_local_path(path=\"/data/projects/fate/examples/data/dolly\", namespace=\"experiment\",\n", " name=\"dolly\")\n", "time.sleep(5)\n", "\n", "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest, host=host))\n", "reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"dolly\"\n", ")\n", "reader_0.hosts[0].task_parameters(\n", " namespace=\"experiment\",\n", " name=\"dolly\"\n", ")\n", "\n", "tokenizer_params = dict(\n", " pretrained_model_name_or_path=\"gpt2\",\n", " trust_remote_code=True,\n", ")\n", "conf = get_config_of_seq2seq_runner(\n", " algo='fedkseed',\n", " model=LLMModelLoader(\n", " \"hf_model\",\n", " \"HFAutoModelForCausalLM\",\n", " # pretrained_model_name_or_path=\"datajuicer/LLaMA-1B-dj-refine-150B\",\n", " pretrained_model_name_or_path=\"gpt2\",\n", " trust_remote_code=True\n", " ),\n", " dataset=LLMDatasetLoader(\n", " \"hf_dataset\",\n", " \"Dolly15K\",\n", " split=\"train\",\n", " tokenizer_params=tokenizer_params,\n", " tokenizer_apply_params=dict(\n", " truncation=True,\n", " max_length=1024,\n", " )),\n", " data_collator=LLMDataFuncLoader(\n", " \"cust_func.cust_data_collator\",\n", " \"get_seq2seq_tokenizer\",\n", " tokenizer_params=tokenizer_params,\n", " ),\n", " training_args=TrainingArguments(\n", " num_train_epochs=0.01,\n", " per_device_train_batch_size=batch_size,\n", " remove_unused_columns=True,\n", " learning_rate=lr,\n", " fp16=False,\n", " use_cpu=False,\n", " disable_tqdm=False,\n", " ),\n", " fed_args=FedAVGArguments(),\n", " task_type='causal_lm',\n", " 
save_trainable_weights_only=True,\n", ")\n", "\n", "conf[\"fed_args_conf\"] = {}\n", "\n", "homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " runner_conf=conf,\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"fedkseed_runner\",\n", " runner_class=\"FedKSeedRunner\",\n", ")\n", "\n", "pipeline.add_tasks([reader_0, homo_nn_0])\n", "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 1}))\n", "\n", "pipeline.compile()\n", "pipeline.fit()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can use this script to submit the model, but submitting the model will take a long time to train and generate a long log, so we won't do it here." ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.0" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: doc/tutorial/fedmkt/README.md ================================================ # FATE-LLM: FedMKT The algorithm is based on paper ["FedMKT: Federated Mutual Knowledge Transfer for Large and SmallLanguage Models"](https://aclanthology.org/2025.coling-main.17.pdf), We integrate its code into the FATE-LLM framework. ## Citation If you publish work that uses FedMKT, please cite FedMKT as follows: ``` @inproceedings{fan2025fedmkt, title={Fedmkt: Federated mutual knowledge transfer for large and small language models}, author={Fan, Tao and Ma, Guoqiang and Kang, Yan and Gu, Hanlin and Song, Yuanfeng and Fan, Lixin and Chen, Kai and Yang, Qiang}, booktitle={Proceedings of the 31st International Conference on Computational Linguistics}, pages={243--255}, year={2025} } ``` ================================================ FILE: doc/tutorial/fedmkt/fedmkt.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Federated Tuning With FedMKT methods in FATE-LLM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, we will demonstrate how to efficiently train federated large language models using the FATE-LLM framework. In FATE-LLM, we introduce the \"FedMKT\" module, specifically designed for federated learning with large language models. FedMKT introduces a novel\n", "federated mutual knowledge transfer framework that enables effective knowledge transfer between an LLM deployed on the server and SLMs residing on clients.\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Algorithm is based on paper [\"FedMKT: Federated Mutual Knowledge Transfer for Large and SmallLanguage Models\"](https://arxiv.org/pdf/2406.02224), We integrate its code into the FATE-LLM framework. \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiments\n", "\n", "Chapter List: \n", "* settings\n", " 1. DataSet: ARC-Challenge\n", " 2. Models Use in \"FEDMKT\" Paper\n", " 3. Prepare Optimal Vocabulary Mapping Tables\n", " 4. Training LLMs with Lora\n", "* experiment examples:\n", " 1. Running FEDMKT With Launcher (Experimential Using): 4-SLMs\n", " 2. Running FEDMKT With Launcher (Experimential Using): 1-SLM (One To One)\n", " 3. Running FEDMKT With Launcher (Experimential Using): 1-SLM And SLM Trains Only (LLM2SLM)\n", " 4. 
Running FEDMKT With Launcher (Experimential Using): 4-SLMs Homogeneous SFT\n", " 5. Running FEDMKT with Pipeline (Industrial Using)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset: ARC-Challenge\n", "\n", "ARC-Challenge is a dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in advanced question-answering. \n", "\n", "You can refer to the following link for more details about [ARC-Challenge](https://huggingface.co/datasets/allenai/ai2_arc)\n", "\n", "In this section, we will download the ARC-Challenge dataset from huggingface and split it into five parts: part \"common\" serves as the public dataset, and the other parts are the private training data of the SLMs (opt, gpt2, llama, bloom). " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import datasets\n", "\n", "\n", "data = datasets.load_dataset(\"ai2_arc\", \"ARC-Challenge\", download_mode=\"force_redownload\", ignore_verifications=True)\n", "train_data = data.pop(\"train\")\n", "\n", "seed=123\n", "n = train_data.shape[0]\n", "client_num = 4\n", "process_data_output_dir = \"\" # processed data saved directory should be specified, it will be used later.\n", "\n", "client_data_num = n // (client_num + 1)\n", "\n", "for i in range(client_num):\n", " splits = train_data.train_test_split(train_size=client_data_num, shuffle=True, seed=seed)\n", " client_name = f\"client_{i}\"\n", " data[client_name] = splits[\"train\"]\n", " train_data = splits[\"test\"]\n", "\n", "if train_data.shape[0] == client_data_num:\n", " data[\"common\"] = train_data\n", "else:\n", " data[\"common\"] = train_data.train_test_split(\n", " train_size=client_data_num, shuffle=True, seed=seed\n", " )[\"train\"]\n", "\n", "data.save_to_disk(process_data_output_dir)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Models Use In \"FEDMKT\" Paper\n", "\n", "LLM: [Llama-2-7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) \n", "SLM-0: [opt-1.3b](https://huggingface.co/facebook/opt-1.3b) \n", "SLM-1: [gpt2-xlarge](https://huggingface.co/openai-community/gpt2-xl) \n", "SLM-2: [Llama-1.3b](https://huggingface.co/princeton-nlp/Sheared-LLaMA-1.3B) \n", "SLM-3: [bloom-1.1B](https://huggingface.co/bigscience/bloom-1b1)\n", "\n", "Users should download the models from huggingface and save them in local directories before the following steps; the models are large, so re-downloading them costs too much time.\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# replace the model names with your local save directories\n", "llm_pretrained_path = \"llama-2-7b-hf\"\n", "slm_0_pretrained_path = \"opt-1.3b\"\n", "slm_1_pretrained_path = \"gpt2-xl\"\n", "slm_2_pretrained_path = \"Sheared-LLaMA-1.3B\"\n", "slm_3_pretrained_path = \"bloom-1b1\"\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare Optimal Vocabulary Mapping Tables\n", "\n", "To use \"FEDMKT\" for federated knowledge transfer, we need to build optimal vocabulary mapping tables first.\n", "In the \"FEDMKT\" paper there are one LLM and four SLMs, so we need to build eight optimal vocabulary mapping tables. 
For each (LLM, SLM) pair, two tables should be built, since co-training is needed.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.fedmkt.token_alignment.vocab_mapping import get_vocab_mappings\n", "\n", "\n", "llm_slm_pairs = [\n", " (llm_pretrained_path, slm_0_pretrained_path),\n", " (llm_pretrained_path, slm_1_pretrained_path),\n", " (llm_pretrained_path, slm_2_pretrained_path),\n", " (llm_pretrained_path, slm_3_pretrained_path)\n", "]\n", "\n", "vocab_mapping_directory = \"\" # replace this with the actual path\n", "\n", "slm_to_llm_vocab_mapping_paths = [\"opt_to_llama.json\", \"gpt2_to_llama.json\", \"llama_small_to_llama.json\", \"bloom_to_llama.json\"]\n", "llm_to_slm_vocab_mapping_paths = [\"llama_to_opt.json\", \"llama_to_gpt2.json\", \"llama_to_llama_small\", \"llama_to_bloom.json\"]\n", "\n", "for idx in range(4):\n", " slm_to_llm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + llm_to_slm_vocab_mapping_paths[idx]\n", "\n", "for idx, (llm_pretrained, slm_pretrained) in enumerate(llm_slm_pairs):\n", " slm_to_llm_vocab_mapping_path = slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_path = llm_to_slm_vocab_mapping_paths[idx]\n", " _ = get_vocab_mappings(slm_pretrained, llm_pretrained, slm_to_llm_vocab_mapping_paths[idx], num_processors=16)\n", " _ = get_vocab_mappings(llm_pretrained, slm_pretrained, llm_to_slm_vocab_mapping_paths[idx], num_processors=16)\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Training LLMs with Lora\n", "\n", "In this section, we will introduce the lora configs used by the five models listed in the paper: one LLM (Llama-2-7B) and four SLMs (opt-1.3B, gpt2-xlarge, Llama-1.3B, bloom-1.1B).\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "LLM models with peft are located in fate_llm/model_zoo; below we give a guide to using them. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Init LLM Llama-2-7B's Lora Config" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from peft import LoraConfig, TaskType\n", "\n", "lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']\n", ")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Init SLMs Lora Config" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "slm_pretrained_paths = [slm_0_pretrained_path, slm_1_pretrained_path, slm_2_pretrained_path, slm_3_pretrained_path]\n", "slm_lora_target_modules = [\n", " [\"q_proj\", \"v_proj\"],\n", " [\"c_attn\"],\n", " ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n", " [\"query_key_value\"]\n", "]\n", "\n", "def get_slm_conf(slm_idx):\n", " slm_pretrained_path = slm_pretrained_paths[slm_idx]\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=slm_lora_target_modules[slm_idx]\n", " )\n", " return lora_config" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Running FEDMKT With Launcher (Experimential Using): 4-SLMs\n", "\n", "Using the launcher to start up is mainly for experimental use. Before running this section, make sure that [FATE-LLM Standalone](https://github.com/FederatedAI/FATE-LLM?tab=readme-ov-file#standalone-deployment) has been deployed."
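, "\n", "Under the hood, the launcher starts one local process for every party listed in `--parties` and passes each process a `Context`; inside `run(ctx)` you decide from the party whether to run the LLM-side or the SLM-side training. The following skeleton is only a sketch of this pattern (the complete, runnable scripts are given later in this section):\n", "\n", "```python\n", "# sketch of a launcher entrypoint; the full scripts below fill in train_llm/train_slm\n", "from fate.arch import Context\n", "from fate.arch.launchers.multiprocess_launcher import launch\n", "\n", "def run(ctx: Context):\n", "    if ctx.is_on_arbiter:   # the arbiter party hosts the LLM\n", "        ...                 # train_llm(ctx)\n", "    else:                   # guest/host parties each host one SLM\n", "        ...                 # train_slm(ctx, slm_idx=...)\n", "\n", "if __name__ == '__main__':\n", "    launch(run)  # e.g. python fedmkt_4_slms.py --parties guest:9999 host:9999 host:10000 host:10001 arbiter:9999\n", "```"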
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Global Settings" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "process_data_output_dir = \"\"\n", "llm_pretrained_path = \"Llama-2-7b-hf\"\n", "slm_0_pretrained_path = \"opt-1.3b\"\n", "slm_1_pretrained_path = \"gpt2-xl\"\n", "slm_2_pretrained_path = \"Sheared-LLaMa-1.3B\"\n", "slm_3_pretrained_path = \"bloom-1b1\"\n", "llm_slm_pairs = [\n", " (llm_pretrained_path, slm_0_pretrained_path),\n", " (llm_pretrained_path, slm_1_pretrained_path),\n", " (llm_pretrained_path, slm_2_pretrained_path),\n", " (llm_pretrained_path, slm_3_pretrained_path)\n", "]\n", "\n", "vocab_mapping_directory = \"\"\n", "\n", "slm_to_llm_vocab_mapping_paths = [\"opt_to_llama.json\", \"gpt2_to_llama.json\", \"llama_small_to_llama.json\", \"bloom_to_llama.json\"]\n", "llm_to_slm_vocab_mapping_paths = [\"llama_to_opt.json\", \"llama_to_gpt2.json\", \"llama_to_llama_small\", \"llama_to_bloom.json\"]\n", "\n", "for idx in range(4):\n", " slm_to_llm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + llm_to_slm_vocab_mapping_paths[idx]\n", "\n", "#### all variables has been defined above\n", "\n", "slm_pretrained_paths = [slm_0_pretrained_path, slm_1_pretrained_path, slm_2_pretrained_path, slm_3_pretrained_path]\n", "slm_lora_target_modules = [\n", " [\"q_proj\", \"v_proj\"],\n", " [\"c_attn\"],\n", " ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n", " [\"query_key_value\"]\n", "]\n", "\n", "global_epochs = 1\n", "batch_size=4\n", "llm_lr = 3e-5\n", "slm_lrs = [3e-5, 3e-4, 3e-5, 3e-5, 3e-5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Init FEDMKTLLM Runner" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "In this Section, we will introduce how to initialize \"FEDMKTLLM\" object." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Step1: Initialize LLM With LoraConfig" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from peft import LoraConfig, TaskType\n", "from fate_llm.model_zoo.pellm.llama import LLaMa\n", "from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTLLM\n", "from fate.ml.nn.homo.fedavg import FedAVGArguments\n", "from fate_llm.dataset.qa_dataset import QaDataset\n", "from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", "from transformers import AutoConfig\n", "\n", "lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']\n", ")\n", "\n", "model = LLaMa(\n", " pretrained_path=llm_pretrained_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\" \n", ")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Step2: Specify Public Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pub_data = QaDataset(tokenizer_name_or_path=llm_pretrained_path,\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", "pub_data.load(process_data_output_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Step3: Initialize FEDMKT Training Args" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=llm_lr,\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(llm_pretrained_path).vocab_size, # pay attention to this, \n", " # vocab_size must be specified to avoid dimension mismatch \n", " # of tokenizer's vocab_size\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Step4: Initialize Other Variables" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fed_args = FedAVGArguments(\n", " aggregate_strategy='epoch',\n", " aggregate_freq=1\n", ")\n", "\n", "slm_to_llm_vocab_mapping = []\n", "for path in slm_to_llm_vocab_mapping_paths:\n", " with open(path, \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", " slm_to_llm_vocab_mapping.append(vocab_mapping)\n", "\n", "slm_tokenizers = [get_tokenizer(slm_pretrained_path) for slm_pretrained_path in slm_pretrained_paths]\n", "tokenizer = get_tokenizer(llm_pretrained_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Step5: New FEDMKTLLM Object" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainer = FedMKTLLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " fed_args=fed_args,\n", " train_set=pub_data,\n", " tokenizer=tokenizer,\n", " slm_tokenizers=slm_tokenizers,\n", " slm_to_llm_vocab_mappings=slm_to_llm_vocab_mapping,\n", " save_trainable_weights_only=True, # save lora weights only\n", ")" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "##### Step6: Training And Save Results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainer.train()\n", "trainer.save_model(output_dir=\"fill the path to save llm finetuning result\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Init FEDMKTSLM Runner\n", "\n", "FEDMKTSLM Runner is a slightly different of FEDMKTLLM Runner, we only introduce different variables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Import SLMs you need to run, here we choose four Slms Using In Original Paper." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import transformers\n", "from peft import LoraConfig, TaskType \n", "from fate_llm.model_zoo.pellm.llama import LLaMa\n", "from fate_llm.model_zoo.pellm.gpt2 import GPT2CLM\n", "from fate_llm.model_zoo.pellm.opt import OPT\n", "from fate_llm.model_zoo.pellm.bloom import Bloom\n", "from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTSLM\n", "from fate_llm.dataset.qa_dataset import QaDataset\n", "from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", "from transformers import AutoConfig\n", "\n", "slm_idx = 0\n", "\n", "slm_model_class = [\n", " OPT,\n", " GPT2CLM,\n", " LLaMa,\n", " Bloom\n", "]\n", " \n", "lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=slm_lora_target_modules[slm_idx]\n", ")\n", "\n", "model = slm_model_class[slm_idx](\n", " pretrained_path=slm_pretrained_paths[slm_idx],\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Specify Private Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "priv_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=f\"client_{slm_idx}\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", "priv_data.load(process_data_output_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Other Variables " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer = get_tokenizer(slm_pretrained_paths[slm_idx])\n", "\n", "import json\n", "with open(llm_to_slm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### New FEDMKTSLM Object" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainer = FedMKTSLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " fed_args=fed_args,\n", " pub_train_set=pub_data,\n", " priv_train_set=priv_data,\n", " tokenizer=tokenizer,\n", " save_trainable_weights_only=True, # save lora weights only\n", " llm_tokenizer=get_tokenizer(llm_pretrained_path), # different with LLM setting\n", " llm_to_slm_vocab_mapping=vocab_mapping, # different with LLM setting\n", " data_collator=transformers.DataCollatorForSeq2Seq(tokenizer) # use to train private dataset\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Complete Code To DO SFT With 4 SLMs\n", "\n", "Please paste the code in \"fedmkt_4_slms.py\" and execute it with the following command" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"```python\n", "python fedmkt_4_slms.py --parties guest:9999 host:9999 host:10000 host:10001 arbiter:9999 --log_level INFO\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fedmkt_4_slms.py\n", "\n", "import os\n", "\n", "from fate.arch import Context\n", "from fate.arch.launchers.multiprocess_launcher import launch\n", "import json\n", "\n", "process_data_output_dir = \"\"\n", "llm_pretrained_path = \"Llama-2-7b-hf\"\n", "slm_0_pretrained_path = \"opt-1.3b\"\n", "slm_1_pretrained_path = \"gpt2-xl\"\n", "slm_2_pretrained_path = \"Sheared-LLaMa-1.3B\"\n", "slm_3_pretrained_path = \"bloom-1b1\"\n", "llm_slm_pairs = [\n", " (llm_pretrained_path, slm_0_pretrained_path),\n", " (llm_pretrained_path, slm_1_pretrained_path),\n", " (llm_pretrained_path, slm_2_pretrained_path),\n", " (llm_pretrained_path, slm_3_pretrained_path)\n", "]\n", "\n", "vocab_mapping_directory = \"\"\n", "\n", "slm_to_llm_vocab_mapping_paths = [\"opt_to_llama.json\", \"gpt2_to_llama.json\", \"llama_small_to_llama.json\", \"bloom_to_llama.json\"]\n", "llm_to_slm_vocab_mapping_paths = [\"llama_to_opt.json\", \"llama_to_gpt2.json\", \"llama_to_llama_small\", \"llama_to_bloom.json\"]\n", "\n", "for idx in range(4):\n", " slm_to_llm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + llm_to_slm_vocab_mapping_paths[idx]\n", "\n", "slm_pretrained_paths = [slm_0_pretrained_path, slm_1_pretrained_path, slm_2_pretrained_path, slm_3_pretrained_path]\n", "slm_lora_target_modules = [\n", " [\"q_proj\", \"v_proj\"],\n", " [\"c_attn\"],\n", " ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n", " [\"query_key_value\"]\n", "]\n", "\n", "global_epochs = 5\n", "batch_size=4\n", "llm_lr = 3e-5\n", "slm_lrs = [3e-5, 3e-4, 3e-5, 3e-5, 3e-5]\n", "\n", "llm_model_saved_directory = \"./models/fedmkt_4_slms_llm_model\"\n", "slm_models_saved_directory = [\n", " \"./models/fedmkt_4_slms_slm_0\", \n", " \"./models/fedmkt_4_slms_slm_1\", \n", " \"./models/fedmkt_4_slms_slm_2\", \n", " \"./models/fedmkt_4_slms_slm_3\"\n", "]\n", "\n", "\n", "def train_llm(ctx):\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.llama import LLaMa\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTLLM\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']\n", " )\n", "\n", " model = LLaMa(\n", " pretrained_path=llm_pretrained_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=llm_pretrained_path,\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=llm_lr,\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " 
optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(llm_pretrained_path).vocab_size,\n", " )\n", "\n", " slm_to_llm_vocab_mapping = []\n", " for path in slm_to_llm_vocab_mapping_paths:\n", " with open(path, \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", " slm_to_llm_vocab_mapping.append(vocab_mapping)\n", "\n", " slm_tokenizers = [get_tokenizer(slm_pretrained_path) for slm_pretrained_path in slm_pretrained_paths]\n", "\n", " tokenizer = get_tokenizer(llm_pretrained_path)\n", " trainer = FedMKTLLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " train_set=pub_data,\n", " tokenizer=tokenizer,\n", " slm_tokenizers=slm_tokenizers,\n", " slm_to_llm_vocab_mappings=slm_to_llm_vocab_mapping,\n", " save_trainable_weights_only=True,\n", " )\n", "\n", " trainer.train()\n", " trainer.save_model(llm_model_saved_directory)\n", "\n", "\n", "def train_slm(ctx, slm_idx):\n", " import transformers\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.llama import LLaMa\n", " from fate_llm.model_zoo.pellm.gpt2 import GPT2CLM\n", " from fate_llm.model_zoo.pellm.opt import OPT\n", " from fate_llm.model_zoo.pellm.bloom import Bloom\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTSLM\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " slm_model_class = [\n", " OPT,\n", " GPT2CLM,\n", " LLaMa,\n", " Bloom\n", " ]\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=slm_lora_target_modules[slm_idx]\n", " )\n", "\n", " model = slm_model_class[slm_idx](\n", " pretrained_path=slm_pretrained_paths[slm_idx],\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " priv_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=f\"client_{slm_idx}\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " priv_data.load(process_data_output_dir)\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=slm_lrs[slm_idx],\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(slm_pretrained_paths[slm_idx]).vocab_size,\n", " )\n", "\n", " tokenizer = get_tokenizer(slm_pretrained_paths[slm_idx])\n", "\n", " import json\n", " with open(llm_to_slm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", "\n", " trainer = FedMKTSLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " pub_train_set=pub_data,\n", " 
priv_train_set=priv_data,\n", " tokenizer=tokenizer,\n", " save_trainable_weights_only=True,\n", " llm_tokenizer=get_tokenizer(llm_pretrained_path),\n", " llm_to_slm_vocab_mapping=vocab_mapping,\n", " data_collator=transformers.DataCollatorForSeq2Seq(tokenizer)\n", " )\n", "\n", " trainer.train()\n", " trainer.save_model(slm_models_saved_directory[slm_idx])\n", "\n", "\n", "def run(ctx: Context):\n", " if ctx.is_on_arbiter:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", " train_llm(ctx)\n", " elif ctx.is_on_guest:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", " train_slm(ctx, slm_idx=0)\n", " else:\n", " if ctx.local.party[1] == \"9999\":\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"\n", " slm_idx = 1\n", " elif ctx.local.party[1] == \"10000\":\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n", " slm_idx = 2\n", " elif ctx.local.party[1] == \"10001\":\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"4\"\n", " slm_idx = 3\n", " else:\n", " raise ValueError(f\"party_id={ctx.local.party[1]} is illegal\")\n", "\n", " train_slm(ctx, slm_idx=slm_idx)\n", "\n", "\n", "if __name__ == \"__main__\":\n", " launch(run)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Running FEDMKT With Launcher (Experimential Using): 1-SLM (One To One)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Actually, a slightly modifications from 4-SLMs running code are enough to do sft with single clients, it will be listed in below sections, we take SLM-0(OPT) as an example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Only Use Single Optimal Vocabulary Mapping Tables" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "slm_idx = 0\n", "slm_to_llm_vocab_mapping = []\n", "with open(slm_to_llm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", " slm_to_llm_vocab_mapping.append(vocab_mapping)\n", "\n", "slm_tokenizers = [get_tokenizer(slm_pretrained_paths[slm_idx])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Complete Code To DO SFT With 1 SLM\n", "\n", "Please paste the code in \"fedmkt_1_slm.py\" and execute it with the following command" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```python\n", "python fedmkt_1_slm.py --parties guest:9999 arbiter:9999 --log_level INFO\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fedmkt_1_slm.py\n", "\n", "import os\n", "\n", "from fate.arch import Context\n", "from fate.arch.launchers.multiprocess_launcher import launch\n", "import json\n", "\n", "process_data_output_dir = \"\"\n", "llm_pretrained_path = \"Llama-2-7b-hf\"\n", "slm_0_pretrained_path = \"opt-1.3b\"\n", "slm_1_pretrained_path = \"gpt2-xl\"\n", "slm_2_pretrained_path = \"Sheared-LLaMa-1.3B\"\n", "slm_3_pretrained_path = \"bloom-1b1\"\n", "llm_slm_pairs = [\n", " (llm_pretrained_path, slm_0_pretrained_path),\n", " (llm_pretrained_path, slm_1_pretrained_path),\n", " (llm_pretrained_path, slm_2_pretrained_path),\n", " (llm_pretrained_path, slm_3_pretrained_path)\n", "]\n", "\n", "vocab_mapping_directory = \"\"\n", "\n", "slm_to_llm_vocab_mapping_paths = [\"opt_to_llama.json\", \"gpt2_to_llama.json\", \"llama_small_to_llama.json\", \"bloom_to_llama.json\"]\n", "llm_to_slm_vocab_mapping_paths = [\"llama_to_opt.json\", \"llama_to_gpt2.json\", \"llama_to_llama_small\", \"llama_to_bloom.json\"]\n", "\n", "for idx in range(4):\n", " slm_to_llm_vocab_mapping_paths[idx] = 
vocab_mapping_directory + \"/\" + slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + llm_to_slm_vocab_mapping_paths[idx]\n", "\n", "slm_pretrained_paths = [slm_0_pretrained_path, slm_1_pretrained_path, slm_2_pretrained_path, slm_3_pretrained_path]\n", "slm_lora_target_modules = [\n", " [\"q_proj\", \"v_proj\"],\n", " [\"c_attn\"],\n", " ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n", " [\"query_key_value\"]\n", "]\n", "\n", "global_epochs = 5\n", "batch_size = 4\n", "llm_lr = 3e-5\n", "slm_lrs = [3e-5]\n", "\n", "llm_model_saved_directory = \"./models/fedmkt_single_slm_llm\"\n", "slm_models_saved_directory = [\n", " \"./models/fedmkt_single_slm_opt\",\n", "]\n", "\n", "\n", "def train_llm(ctx, slm_idx):\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.llama import LLaMa\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTLLM\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']\n", " )\n", "\n", " model = LLaMa(\n", " pretrained_path=llm_pretrained_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=llm_pretrained_path,\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=llm_lr,\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(llm_pretrained_path).vocab_size,\n", " )\n", "\n", " slm_to_llm_vocab_mapping = []\n", " with open(slm_to_llm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", " slm_to_llm_vocab_mapping.append(vocab_mapping)\n", "\n", " slm_tokenizers = [get_tokenizer(slm_pretrained_paths[slm_idx])]\n", "\n", " tokenizer = get_tokenizer(llm_pretrained_path)\n", " trainer = FedMKTLLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " train_set=pub_data,\n", " tokenizer=tokenizer,\n", " slm_tokenizers=slm_tokenizers,\n", " slm_to_llm_vocab_mappings=slm_to_llm_vocab_mapping,\n", " save_trainable_weights_only=True,\n", " )\n", "\n", " trainer.train()\n", " trainer.save_model(llm_model_saved_directory)\n", "\n", "\n", "def train_slm(ctx, slm_idx):\n", " import transformers\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.llama import LLaMa\n", " from fate_llm.model_zoo.pellm.gpt2 import GPT2CLM\n", " from fate_llm.model_zoo.pellm.opt import OPT\n", " from fate_llm.model_zoo.pellm.bloom import Bloom\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTSLM\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from 
fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " slm_model_class = [\n", " OPT,\n", " GPT2CLM,\n", " LLaMa,\n", " Bloom\n", " ]\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=slm_lora_target_modules[slm_idx]\n", " )\n", "\n", " model = slm_model_class[slm_idx](\n", " pretrained_path=slm_pretrained_paths[slm_idx],\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " priv_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=f\"client_{slm_idx}\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " priv_data.load(process_data_output_dir)\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=slm_lrs[slm_idx],\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(slm_pretrained_paths[slm_idx]).vocab_size,\n", " )\n", "\n", " tokenizer = get_tokenizer(slm_pretrained_paths[slm_idx])\n", "\n", " import json\n", " with open(llm_to_slm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", "\n", " trainer = FedMKTSLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " pub_train_set=pub_data,\n", " priv_train_set=priv_data,\n", " tokenizer=tokenizer,\n", " save_trainable_weights_only=True,\n", " llm_tokenizer=get_tokenizer(llm_pretrained_path),\n", " llm_to_slm_vocab_mapping=vocab_mapping,\n", " data_collator=transformers.DataCollatorForSeq2Seq(tokenizer)\n", " )\n", "\n", " trainer.train()\n", " trainer.save_model(slm_models_saved_directory[slm_idx])\n", "\n", "\n", "def run(ctx: Context):\n", " if ctx.is_on_arbiter:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", " train_llm(ctx, slm_idx=0)\n", " else:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", " train_slm(ctx, slm_idx=0)\n", "\n", "\n", "if __name__ == \"__main__\":\n", " launch(run)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Running FEDMKT With Launcher (Experimential Using): 1-SLM And SLM Trains Only (LLM2SLM)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this section, we introduce how to do SFT using FEDMKT algorithm, with only single SLM are trained, but without LLM training, means that SLM distill knowlege from LLM only, not co-training." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Difference With Section \"Running FEDMKT With Launcher (Experimential Using): 1-SLMs\"\n", "\n", "Add llm_training=False to fedmkt_training_args to both LLM and LLM is enough!" 
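, "\n", "In other words, keep every argument of `FedMKTTrainingArguments` exactly as in the previous section (including `vocab_size`), and simply add `llm_training=False` on both the LLM (arbiter) side and the SLM (guest) side. A minimal illustration for the SLM side (assuming the same variables as above; the LLM side adds the identical flag):\n", "\n", "```python\n", "from transformers import AutoConfig\n", "from fate_llm.algo.fedmkt import FedMKTTrainingArguments\n", "\n", "# all other arguments are unchanged from the 1-SLM example\n", "training_args = FedMKTTrainingArguments(\n", "    global_epochs=5,\n", "    per_device_train_batch_size=1,\n", "    gradient_accumulation_steps=4,\n", "    learning_rate=3e-5,\n", "    output_dir='./',\n", "    vocab_size=AutoConfig.from_pretrained(slm_0_pretrained_path).vocab_size,\n", "    llm_training=False,  # the SLM only distills knowledge from the LLM; the LLM is not fine-tuned\n", ")\n", "```"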
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Complete Code To DO SFT With 1 SLM And SLM Trains Only\n", "\n", "Please paste the code in \"fedmkt_llm_to_slm.py\" and execute it with the following command" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```python\n", "python fedmkt_llm_to_slm.py --parties guest:9999 arbiter:9999 --log_level INFO\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fedmkt_llm_to_slm.py\n", "\n", "import os\n", "\n", "from fate.arch import Context\n", "from fate.arch.launchers.multiprocess_launcher import launch\n", "import json\n", "\n", "process_data_output_dir = \"\"\n", "llm_pretrained_path = \"Llama-2-7b-hf\"\n", "slm_0_pretrained_path = \"opt-1.3b\"\n", "slm_1_pretrained_path = \"gpt2-xl\"\n", "slm_2_pretrained_path = \"Sheared-LLaMa-1.3B\"\n", "slm_3_pretrained_path = \"bloom-1b1\"\n", "llm_slm_pairs = [\n", " (llm_pretrained_path, slm_0_pretrained_path),\n", " (llm_pretrained_path, slm_1_pretrained_path),\n", " (llm_pretrained_path, slm_2_pretrained_path),\n", " (llm_pretrained_path, slm_3_pretrained_path)\n", "]\n", "\n", "vocab_mapping_directory = \"\"\n", "\n", "slm_to_llm_vocab_mapping_paths = [\"opt_to_llama.json\", \"gpt2_to_llama.json\", \"llama_small_to_llama.json\", \"bloom_to_llama.json\"]\n", "llm_to_slm_vocab_mapping_paths = [\"llama_to_opt.json\", \"llama_to_gpt2.json\", \"llama_to_llama_small\", \"llama_to_bloom.json\"]\n", "\n", "for idx in range(4):\n", " slm_to_llm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + llm_to_slm_vocab_mapping_paths[idx]\n", "\n", "slm_pretrained_paths = [slm_0_pretrained_path, slm_1_pretrained_path, slm_2_pretrained_path, slm_3_pretrained_path]\n", "slm_lora_target_modules = [\n", " [\"q_proj\", \"v_proj\"],\n", " [\"c_attn\"],\n", " ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n", " [\"query_key_value\"]\n", "]\n", "\n", "global_epochs = 5\n", "batch_size = 4\n", "llm_lr = 3e-5\n", "slm_lrs = [3e-5]\n", "\n", "slm_models_saved_directory = [\n", " \"./models/fedmkt_llm_to_slm_opt\",\n", "]\n", "\n", "\n", "def train_llm(ctx, slm_idx):\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.llama import LLaMa\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTLLM\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']\n", " )\n", "\n", " model = LLaMa(\n", " pretrained_path=llm_pretrained_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=llm_pretrained_path,\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=llm_lr,\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " 
warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(llm_pretrained_path).vocab_size,\n", " llm_training=False\n", " )\n", "\n", " slm_to_llm_vocab_mapping = []\n", " with open(slm_to_llm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", " slm_to_llm_vocab_mapping.append(vocab_mapping)\n", "\n", " slm_tokenizers = [get_tokenizer(slm_pretrained_paths[slm_idx])]\n", "\n", " tokenizer = get_tokenizer(llm_pretrained_path)\n", " trainer = FedMKTLLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " train_set=pub_data,\n", " tokenizer=tokenizer,\n", " slm_tokenizers=slm_tokenizers,\n", " slm_to_llm_vocab_mappings=slm_to_llm_vocab_mapping,\n", " save_trainable_weights_only=True,\n", " )\n", "\n", " trainer.train()\n", "\n", "\n", "def train_slm(ctx, slm_idx):\n", " import transformers\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.llama import LLaMa\n", " from fate_llm.model_zoo.pellm.gpt2 import GPT2CLM\n", " from fate_llm.model_zoo.pellm.opt import OPT\n", " from fate_llm.model_zoo.pellm.bloom import Bloom\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTSLM\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " slm_model_class = [\n", " OPT,\n", " GPT2CLM,\n", " LLaMa,\n", " Bloom\n", " ]\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1,\n", " target_modules=slm_lora_target_modules[slm_idx]\n", " )\n", "\n", " model = slm_model_class[slm_idx](\n", " pretrained_path=slm_pretrained_paths[slm_idx],\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " priv_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=f\"client_{slm_idx}\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " priv_data.load(process_data_output_dir)\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=slm_lrs[slm_idx],\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(slm_pretrained_paths[slm_idx]).vocab_size,\n", " llm_training=False\n", " )\n", "\n", " tokenizer = get_tokenizer(slm_pretrained_paths[slm_idx])\n", "\n", " import json\n", " with open(llm_to_slm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", "\n", " trainer = FedMKTSLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " pub_train_set=pub_data,\n", " 
priv_train_set=priv_data,\n", " tokenizer=tokenizer,\n", " save_trainable_weights_only=True,\n", " llm_tokenizer=get_tokenizer(llm_pretrained_path),\n", " llm_to_slm_vocab_mapping=vocab_mapping,\n", " data_collator=transformers.DataCollatorForSeq2Seq(tokenizer)\n", " )\n", "\n", " trainer.train()\n", " trainer.save_model(slm_models_saved_directory[slm_idx])\n", "\n", "\n", "def run(ctx: Context):\n", " if ctx.is_on_arbiter:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", " train_llm(ctx, slm_idx=0)\n", " else:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", " train_slm(ctx, slm_idx=0)\n", "\n", "\n", "if __name__ == \"__main__\":\n", " launch(run)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Running FEDMKT With Launcher (Experimential Using): 4-SLMs Homogeneous SFT\n", "\n", "To run homogeneous experiments, two steps are needed.\n", "1. add post_fedavg=True to fedmkt_training_args to both LLM and LLM is enough!\n", "2. add fed_args to FEDMKTLLM/FEDMKTSLM" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# initialze fed args\n", "from fate.ml.nn.homo.fedavg import FedAVGArguments\n", "\n", "fed_args = FedAVGArguments(\n", " aggregate_strategy='epoch',\n", " aggregate_freq=1\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Complete Code To DO SFT With 4-SLMs Homogeneous SFT\n", "\n", "Please paste the code in \"fedmkt_4_slms_homo.py\" and execute it with the following command" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```python\n", "python fedmkt_4_slms_homo.py --parties guest:9999 host:9999 host:10000 host:10001 arbiter:9999 --log_level INFO\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fedmkt_4_slms_homo.py\n", "\n", "import os\n", "\n", "from fate.arch import Context\n", "from fate.arch.launchers.multiprocess_launcher import launch\n", "import json\n", "\n", "process_data_output_dir = \"\"\n", "llm_pretrained_path = \"Llama-2-7b-hf\"\n", "slm_0_pretrained_path = \"opt-1.3b\"\n", "slm_1_pretrained_path = \"opt-1.3b\"\n", "slm_2_pretrained_path = \"opt-1.3b\"\n", "slm_3_pretrained_path = \"opt-1.3b\"\n", "llm_slm_pairs = [\n", " (llm_pretrained_path, slm_0_pretrained_path),\n", " (llm_pretrained_path, slm_1_pretrained_path),\n", " (llm_pretrained_path, slm_2_pretrained_path),\n", " (llm_pretrained_path, slm_3_pretrained_path)\n", "]\n", "\n", "vocab_mapping_directory = \"\"\n", "\n", "slm_to_llm_vocab_mapping_paths = [\"opt_to_llama.json\"] * 4\n", "llm_to_slm_vocab_mapping_paths = [\"llama_to_opt.json\"] * 4\n", "\n", "for idx in range(4):\n", " slm_to_llm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + llm_to_slm_vocab_mapping_paths[idx]\n", "\n", "slm_pretrained_paths = [slm_0_pretrained_path] * 4\n", "slm_lora_target_modules = [[\"q_proj\", \"v_proj\"]] * 4\n", "\n", "global_epochs = 5\n", "batch_size = 4\n", "llm_lr = 3e-5\n", "slm_lrs = [3e-5, 3e-5, 3e-5, 3e-5, 3e-5]\n", "\n", "llm_model_saved_directory = \"./models/fedmkt_homo_4_slms_llm_model\"\n", "slm_models_saved_directory = [\n", " \"./models/fedmkt_homo_4_slms_slm_0\",\n", "]\n", "\n", "\n", "def train_llm(ctx):\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.llama import LLaMa\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTLLM\n", " from fate.ml.nn.homo.fedavg 
import FedAVGArguments\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']\n", " )\n", "\n", " model = LLaMa(\n", " pretrained_path=llm_pretrained_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=llm_pretrained_path,\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=llm_lr,\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(llm_pretrained_path).vocab_size,\n", " post_fedavg=True, # difference\n", " )\n", "\n", " # difference\n", " fed_args = FedAVGArguments(\n", " aggregate_strategy='epoch',\n", " aggregate_freq=1\n", " )\n", "\n", " slm_to_llm_vocab_mapping = []\n", " for path in slm_to_llm_vocab_mapping_paths:\n", " with open(path, \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", " slm_to_llm_vocab_mapping.append(vocab_mapping)\n", "\n", " slm_tokenizers = [get_tokenizer(slm_pretrained_path) for slm_pretrained_path in slm_pretrained_paths]\n", "\n", " tokenizer = get_tokenizer(llm_pretrained_path)\n", " trainer = FedMKTLLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args,\n", " fed_args=fed_args, # difference\n", " train_set=pub_data,\n", " tokenizer=tokenizer,\n", " slm_tokenizers=slm_tokenizers,\n", " slm_to_llm_vocab_mappings=slm_to_llm_vocab_mapping,\n", " save_trainable_weights_only=True,\n", " )\n", "\n", " trainer.train()\n", " trainer.save_model(llm_model_saved_directory)\n", "\n", "\n", "def train_slm(ctx, slm_idx):\n", " import transformers\n", " from peft import LoraConfig, TaskType\n", " from fate_llm.model_zoo.pellm.opt import OPT\n", " from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTSLM\n", " from fate.ml.nn.homo.fedavg import FedAVGArguments\n", " from fate_llm.dataset.qa_dataset import QaDataset\n", " from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer\n", " from transformers import AutoConfig\n", "\n", " slm_model_class = [OPT] * 4\n", "\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", " target_modules=slm_lora_target_modules[slm_idx]\n", " )\n", "\n", " model = slm_model_class[slm_idx](\n", " pretrained_path=slm_pretrained_paths[slm_idx],\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " priv_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=f\"client_{slm_idx}\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " 
priv_data.load(process_data_output_dir)\n", "\n", " pub_data = QaDataset(tokenizer_name_or_path=slm_pretrained_paths[slm_idx],\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512,\n", " need_preprocess=True)\n", " pub_data.load(process_data_output_dir)\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=global_epochs,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=batch_size,\n", " learning_rate=slm_lrs[slm_idx],\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(slm_pretrained_paths[slm_idx]).vocab_size,\n", " post_fedavg=True, # difference\n", " )\n", "\n", " # difference\n", " fed_args = FedAVGArguments(\n", " aggregate_strategy='epoch',\n", " aggregate_freq=1\n", " )\n", "\n", " tokenizer = get_tokenizer(slm_pretrained_paths[slm_idx])\n", "\n", " import json\n", " with open(llm_to_slm_vocab_mapping_paths[slm_idx], \"r\") as fin:\n", " vocab_mapping = json.loads(fin.read())\n", "\n", " trainer = FedMKTSLM(\n", " ctx=ctx,\n", " model=model,\n", " training_args=training_args, \n", " fed_args=fed_args, # difference\n", " pub_train_set=pub_data,\n", " priv_train_set=priv_data,\n", " tokenizer=tokenizer,\n", " save_trainable_weights_only=True,\n", " llm_tokenizer=get_tokenizer(llm_pretrained_path),\n", " llm_to_slm_vocab_mapping=vocab_mapping,\n", " data_collator=transformers.DataCollatorForSeq2Seq(tokenizer)\n", " )\n", "\n", " trainer.train()\n", " if slm_idx == 0:\n", " trainer.save_model(slm_models_saved_directory[slm_idx])\n", "\n", "\n", "def run(ctx: Context):\n", " if ctx.is_on_arbiter:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", " train_llm(ctx)\n", " elif ctx.is_on_guest:\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", " train_slm(ctx, slm_idx=0)\n", " else:\n", " if ctx.local.party[1] == \"9999\":\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"\n", " slm_idx = 1\n", " elif ctx.local.party[1] == \"10000\":\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n", " slm_idx = 2\n", " elif ctx.local.party[1] == \"10001\":\n", " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"4\"\n", " slm_idx = 3\n", " else:\n", " raise ValueError(f\"party_id={ctx.local.party[1]} is illegal\")\n", "\n", " train_slm(ctx, slm_idx=slm_idx)\n", "\n", "\n", "if __name__ == \"__main__\":\n", " launch(run)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Running FEDMKT with Pipeline (Industrial Using)\n", "\n", "Please make sure that [FATE-LLM Cluster](https://github.com/FederatedAI/FATE/wiki/Download#llm%E9%83%A8%E7%BD%B2%E5%8C%85) has been deployed, ensure that multiple machines has been deployed in FATE-LLM Cluster mode, past the following code to test_fedmkt_4_slms.py, the execute \"python test_fedmkt_4_slms.py\"" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_fedmkt_runner\n", "from fate_client.pipeline.components.fate.nn.algo_params import FedMKTTrainingArguments, FedAVGArguments\n", "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", "from peft import LoraConfig, TaskType\n", "from fate_client.pipeline import 
FateFlowPipeline\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from transformers import AutoConfig\n", "\n", "guest = '9999' # replace this with the actual guest party id in your environment\n", "host = ['9999', '10000', '10001'] # replace with the actual host party ids in your environment\n", "arbiter = '9999' # replace this with the actual arbiter party id in your environment\n", "\n", "\n", "process_data_output_dir = \"\" # replace this with the actual process_data_output_dir\n", "# replace the model names with their local save directories\n", "llm_pretrained_path = \"llama-2-7b-hf\"\n", "slm_0_pretrained_path = \"opt-1.3b\"\n", "slm_1_pretrained_path = \"gpt2-xl\"\n", "slm_2_pretrained_path = \"Sheared-LLaMA-1.3B\"\n", "slm_3_pretrained_path = \"bloom-1b1\"\n", "llm_slm_pairs = [\n", " (llm_pretrained_path, slm_0_pretrained_path),\n", " (llm_pretrained_path, slm_1_pretrained_path),\n", " (llm_pretrained_path, slm_2_pretrained_path),\n", " (llm_pretrained_path, slm_3_pretrained_path)\n", "]\n", "\n", "vocab_mapping_directory = \"\" # replace this with the actual vocab_mapping_directory\n", "\n", "slm_to_llm_vocab_mapping_paths = [\"opt_to_llama.json\", \"gpt2_to_llama.json\", \"llama_small_to_llama.json\", \"bloom_to_llama.json\"]\n", "llm_to_slm_vocab_mapping_paths = [\"llama_to_opt.json\", \"llama_to_gpt2.json\", \"llama_to_llama_small\", \"llama_to_bloom.json\"]\n", "\n", "for idx in range(4):\n", " slm_to_llm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + slm_to_llm_vocab_mapping_paths[idx]\n", " llm_to_slm_vocab_mapping_paths[idx] = vocab_mapping_directory + \"/\" + llm_to_slm_vocab_mapping_paths[idx]\n", "\n", "slm_pretrained_paths = [slm_0_pretrained_path, slm_1_pretrained_path, slm_2_pretrained_path, slm_3_pretrained_path]\n", "slm_lora_target_modules = [\n", " [\"q_proj\", \"v_proj\"],\n", " [\"c_attn\"],\n", " ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n", " [\"query_key_value\"]\n", "]\n", "slm_models = [\n", " (\"pellm.opt\", \"OPT\"),\n", " (\"pellm.gpt2\", \"GPT2CLM\"),\n", " (\"pellm.llama\", \"LLaMa\"),\n", " (\"pellm.bloom\", \"Bloom\")\n", "]\n", "\n", "\n", "def get_llm_conf():\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05,\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']\n", " )\n", " lora_config.target_modules = list(lora_config.target_modules)\n", "\n", " llm_model = LLMModelLoader(\n", " \"pellm.llama\",\n", " \"LLaMa\",\n", " pretrained_path=llm_pretrained_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " torch_dtype=\"bfloat16\"\n", " )\n", "\n", " pub_dataset = LLMDatasetLoader(\n", " \"qa_dataset\",\n", " \"QaDataset\",\n", " tokenizer_name_or_path=llm_pretrained_path,\n", " need_preprocess=True,\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512\n", " )\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=5,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=4,\n", " learning_rate=3e-5,\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=AutoConfig.from_pretrained(llm_pretrained_path).vocab_size,\n", " )\n", "\n", " fed_args = FedAVGArguments(\n", " 
aggregate_strategy='epoch',\n", " aggregate_freq=1\n", " )\n", "\n", " tokenizer = LLMDataFuncLoader(\n", " \"tokenizers.cust_tokenizer\",\n", " \"get_tokenizer\",\n", " tokenizer_name_or_path=llm_pretrained_path\n", " )\n", "\n", " slm_tokenizers = list()\n", " for slm_pretrained_path in slm_pretrained_paths:\n", " slm_tokenizers.append(\n", " LLMDataFuncLoader(\"tokenizers.cust_tokenizer\", \"get_tokenizer\", tokenizer_name_or_path=slm_pretrained_path)\n", " )\n", "\n", " return get_config_of_fedmkt_runner(\n", " model=llm_model,\n", " training_args=training_args,\n", " fed_args=fed_args,\n", " pub_dataset=pub_dataset,\n", " tokenizer=tokenizer,\n", " slm_tokenizers=slm_tokenizers,\n", " slm_to_llm_vocab_mapping_paths=slm_to_llm_vocab_mapping_paths,\n", " pub_dataset_path=process_data_output_dir,\n", " save_trainable_weights_only=True,\n", " )\n", "\n", "\n", "def get_slm_conf(slm_idx):\n", " slm_pretrained_path = slm_pretrained_paths[slm_idx]\n", " lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", " target_modules=slm_lora_target_modules[slm_idx]\n", " )\n", " lora_config.target_modules = list(lora_config.target_modules)\n", " llm_to_slm_vocab_mapping = llm_to_slm_vocab_mapping_paths[slm_idx]\n", "\n", " slm_model = LLMModelLoader(\n", " slm_models[slm_idx][0],\n", " slm_models[slm_idx][1],\n", " pretrained_path=slm_pretrained_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " )\n", " vocab_size = AutoConfig.from_pretrained(slm_pretrained_path).vocab_size\n", "\n", " pub_dataset = LLMDatasetLoader(\n", " \"qa_dataset\",\n", " \"QaDataset\",\n", " tokenizer_name_or_path=slm_pretrained_path,\n", " need_preprocess=True,\n", " dataset_name=\"arc_challenge\",\n", " data_part=\"common\",\n", " seq_max_len=512\n", " )\n", "\n", " priv_dataset = LLMDatasetLoader(\n", " \"qa_dataset\",\n", " \"QaDataset\",\n", " tokenizer_name_or_path=slm_pretrained_path,\n", " need_preprocess=True,\n", " dataset_name=\"arc_challenge\",\n", " data_part=f\"client_{slm_idx}\",\n", " seq_max_len=512\n", " )\n", "\n", " training_args = FedMKTTrainingArguments(\n", " global_epochs=5,\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=4,\n", " learning_rate=3e-5 if slm_idx != 1 else 3e-4,\n", " output_dir=\"./\",\n", " dataloader_num_workers=4,\n", " remove_unused_columns=False,\n", " warmup_ratio=0.008,\n", " lr_scheduler_type=\"cosine\",\n", " optim=\"adamw_torch\",\n", " adam_beta1=0.9,\n", " adam_beta2=0.95,\n", " weight_decay=0.1,\n", " max_grad_norm=1.0,\n", " use_cpu=False,\n", " vocab_size=vocab_size,\n", " # post_fedavg=True,\n", " # llm_training=False,\n", " )\n", "\n", " fed_args = FedAVGArguments(\n", " aggregate_strategy='epoch',\n", " aggregate_freq=1\n", " )\n", "\n", " tokenizer = LLMDataFuncLoader(\n", " \"tokenizers.cust_tokenizer\",\n", " \"get_tokenizer\",\n", " tokenizer_name_or_path=slm_pretrained_path\n", " )\n", "\n", " llm_tokenizer = LLMDataFuncLoader(\n", " \"tokenizers.cust_tokenizer\", \"get_tokenizer\", tokenizer_name_or_path=llm_pretrained_path\n", " )\n", "\n", " data_collator = LLMDataFuncLoader(module_name='data_collator.cust_data_collator',\n", " item_name='get_seq2seq_data_collator', tokenizer_name_or_path=slm_pretrained_path)\n", "\n", " return get_config_of_fedmkt_runner(\n", " model=slm_model,\n", " training_args=training_args,\n", " fed_args=fed_args,\n", " pub_dataset=pub_dataset,\n", " priv_dataset=priv_dataset,\n", " tokenizer=tokenizer,\n", " 
llm_tokenizer=llm_tokenizer,\n", " llm_to_slm_vocab_mapping_path=llm_to_slm_vocab_mapping,\n", " pub_dataset_path=process_data_output_dir,\n", " save_trainable_weights_only=True,\n", " data_collator=data_collator\n", " )\n", "\n", "\n", "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter, host=host)\n", "pipeline.bind_local_path(path=process_data_output_dir, namespace=\"experiment\", name=\"arc_challenge\")\n", "\n", "\n", "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest, host=host))\n", "reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"arc_challenge\"\n", ")\n", "reader_0.hosts[[0, 1, 2]].task_parameters(\n", " namespace=\"experiment\",\n", " name=\"arc_challenge\"\n", ")\n", "\n", "\n", "homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"fedmkt_runner\",\n", " runner_class=\"FedMKTRunner\",\n", ")\n", "\n", "homo_nn_0.arbiter.task_parameters(\n", " runner_conf=get_llm_conf()\n", ")\n", "\n", "homo_nn_0.guest.task_parameters(\n", " runner_conf=get_slm_conf(slm_idx=0)\n", ")\n", "\n", "for idx in range(3):\n", " homo_nn_0.hosts[idx].task_parameters(\n", " runner_conf=get_slm_conf(slm_idx=idx + 1)\n", " )\n", "\n", "homo_nn_0.guest.conf.set(\"launcher_name\", \"deepspeed\") # tell schedule engine to run task with deepspeed\n", "homo_nn_0.hosts[[0, 1, 2]].conf.set(\"launcher_name\", \"deepspeed\") # tell schedule engine to run task with deepspeed\n", "homo_nn_0.arbiter.conf.set(\"launcher_name\", \"deepspeed\") # tell schedule engine to run task with deepspeed\n", "\n", "pipeline.add_tasks([reader_0, homo_nn_0])\n", "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 1})) # the number of gpus of each party\n", "\n", "pipeline.compile()\n", "pipeline.fit()\n", "\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: doc/tutorial/inferdpt/inferdpt_tutorial.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "341aeb6e-9e25-4a0e-9664-a32ab11293fa", "metadata": {}, "source": [ "# Inferdpt Tutorial" ] }, { "cell_type": "markdown", "id": "0b40afd5-77b9-45c6-a761-81b9a6bddc05", "metadata": {}, "source": [ "## Introduction of Inferdpt\n", "\n", "Inferdpt is an advanced algorithm framework designed for efficient and privacy-preserving text generation using large language models (LLMs). The framework addresses privacy concerns related to data leakage and unauthorized information collection in LLMs. Inferdpt implements Differential Privacy mechanisms to protect sensitive information during the inference process with black-box LLMs.\n", "\n", "Inferdpt comprises two key modules: the \"perturbation module\" and the \"extraction module\". The perturbation module utilizes a differentially private(DP) mechanism to generate a perturbed prompt from the raw document, facilitating privacy-preserving inference with black-box LLMs. The extraction module, inspired by knowledge distillation and retrieval-augmented generation, processes the perturbed text to produce coherent and consistent output. 
This ensures that the text generation quality of InferDPT is comparable to that of non-private LLMs, maintaining high utility while providing strong privacy guarantees.\n", "\n", "To further enhance privacy protection, Inferdpt integrates a novel mechanism called RANTEXT. RANTEXT introduces the concept of random adjacency list for token-level perturbation, addressing the vulnerability of existing differentially private mechanisms to embedding inversion attacks.\n", "\n", "For more details of Inferdpt, please refer to the [original paper](https://arxiv.org/pdf/2310.12214.pdf)." ] }, { "cell_type": "markdown", "id": "ac982b2d-4a71-45a5-a2b1-90259711f36b", "metadata": {}, "source": [ "## Use InferDPT" ] }, { "cell_type": "markdown", "id": "042049c5-80ce-4786-9896-88baddd59f4e", "metadata": {}, "source": [ "In this section, we will guide you through the process of:\n", "- Setting up the inferdpt toolkit with an existing language model.\n", "- Creating a model inference tool using the built-in class.\n", "- Executing a step-by-step walkthrough of an inference instance: Employing inferdpt to generate rationale responses for question-answering tasks." ] }, { "cell_type": "markdown", "id": "e1938eef-106d-4cc0-a9b7-6ad8d9d281f5", "metadata": {}, "source": [ "### Create Inferdpt Kit" ] }, { "cell_type": "markdown", "id": "565aa2ed-5919-4aa0-9499-23b730434c62", "metadata": {}, "source": [ "In alignment with the original paper, the implementation of differential privacy in inferdpt involves the random substitution of tokens in the original text with semantically similar words. To facilitate this process, it is necessary to precalculate the similarities between a subset of tokens from the vocabulary of the remote large language model. In this tutorial, we will utilize the Mistral-7B model as our remote large language model and the Qwen1.5-0.5B model as the local decoding model. For the sake of computational efficiency, we will select a subset of 11,400 tokens from the Mistral-7B vocabulary to perform the similarity calculations and use the built-in function to finally get the inferdpt-kit.\n", "\n", "Firstly we load the mistral model to get the embedding set:" ] }, { "cell_type": "code", "execution_count": 1, "id": "f01a229a-52e1-4a97-af06-a2ab122b7083", "metadata": {}, "outputs": [], "source": [ "# load embeddings from mistral model\n", "import numpy as np\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "model_path = '/data/cephfs/llm/models/Mistral-7B-Instruct-v0.2/'\n", "tokenizer = AutoTokenizer.from_pretrained(model_path)\n", "model = AutoModelForCausalLM.from_pretrained(model_path)\n", "embeddings = tokenizer.get_vocab() # get embeddings matrix" ] }, { "cell_type": "code", "execution_count": 13, "id": "3f7ec40b-1a58-4608-b2c1-3299979e699a", "metadata": {}, "outputs": [], "source": [ "# Get the embedding layer weights\n", "dtype = np.float32\n", "embedding_weights = model.get_input_embeddings().weight\n", "# Convert the embedding layer weights to numpy\n", "embedding_weights_np = embedding_weights.detach().numpy().astype(dtype)" ] }, { "cell_type": "markdown", "id": "07261aee-b676-4a42-9098-2923fa67519c", "metadata": {}, "source": [ "Then we select english tokens from the vocabulary. Then we can get an embedding matrix and a corresponding token list." 
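, "\n", "As a rough illustration (this sketch is an addition to the tutorial, not the exact mechanism inside InferDPTKit), such an embedding matrix is what drives the differentially private substitution: a replacement token can be sampled with probability that grows with embedding similarity, so a larger epsilon keeps the perturbed text closer to the original.\n", "\n", "```python\n", "import numpy as np\n", "\n", "def toy_perturb_token(token_idx, embedding_matrix, epsilon=3.0):\n", "    # cosine similarity between the original token and every candidate token\n", "    normed = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)\n", "    sims = normed @ normed[token_idx]\n", "    # exponential-mechanism style weights: higher similarity -> higher sampling probability\n", "    logits = epsilon * sims / 2\n", "    probs = np.exp(logits - logits.max())\n", "    probs /= probs.sum()\n", "    return int(np.random.choice(len(probs), p=probs))\n", "```\n"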
] }, { "cell_type": "code", "execution_count": 26, "id": "dbb231f9-f0ca-4add-bb45-f4fb59429abb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32000/32000 [00:00<00:00, 663000.04it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "11400\n" ] } ], "source": [ "import tqdm\n", "import re\n", "\n", "def contains_english_chars(string):\n", " pattern = r'[a-zA-Z]'\n", " match = re.search(pattern, string)\n", " return bool(match)\n", "\n", "def contains_non_english_chars(string):\n", " pattern = r'[^a-zA-Z]'\n", " match = re.search(pattern, string)\n", " return bool(match)\n", "\n", "def filter_tokens(token2index):\n", " filtered_index2token = {}\n", " for key, idx in tqdm.tqdm(token2index.items()):\n", " if key.startswith('<'):\n", " continue\n", " if not key.startswith('▁'):\n", " continue\n", " val_ = key.replace(\"▁\", \"\")\n", " if val_ == val_.upper():\n", " continue\n", " if contains_non_english_chars(val_):\n", " continue\n", " if 3 < len(val_) < 16 and contains_english_chars(val_):\n", " filtered_index2token[idx] = key\n", "\n", " return filtered_index2token\n", "\n", "filtered_index2token = filter_tokens(embeddings)\n", "used_num_tokens = len(filtered_index2token)\n", "print(used_num_tokens)\n", "for idx, token in filtered_index2token.items():\n", " token_2_embedding[token] = embedding_weights_np[idx].tolist()\n", "token_list = list(token_2_embedding.keys())\n", "embedding_matrix = np.array(list(token_2_embedding.values()), dtype=dtype)" ] }, { "cell_type": "code", "execution_count": 27, "id": "5922a177-d752-485d-98ab-9fd6688198f8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "we got the embedding matrix:\n", "[[-6.1035156e-04 -4.5471191e-03 -5.2795410e-03 ... -1.3656616e-03\n", " 4.2419434e-03 -8.1634521e-04]\n", " [ 4.8522949e-03 5.9814453e-03 1.1596680e-03 ... -2.6702881e-03\n", " -1.7471313e-03 9.9182129e-04]\n", " [-2.7465820e-03 4.3029785e-03 3.3874512e-03 ... -2.6092529e-03\n", " -1.2397766e-05 -3.4027100e-03]\n", " ...\n", " [-6.1340332e-03 -5.3405762e-03 -1.0910034e-03 ... -9.3841553e-04\n", " -7.4005127e-04 -7.3852539e-03]\n", " [-4.5166016e-03 8.2015991e-04 4.8217773e-03 ... -1.1978149e-03\n", " -1.0528564e-03 -2.1362305e-03]\n", " [ 1.2054443e-03 1.9836426e-03 -2.8419495e-04 ... 
-1.5792847e-03\n", " -2.8381348e-03 -7.1716309e-04]]\n" ] } ], "source": [ "print('we got the embedding matrix:')\n", "print(embedding_matrix)" ] }, { "cell_type": "markdown", "id": "20890d89-998f-4f38-968a-2a6a0648b050", "metadata": {}, "source": [ "We can easily prepare the pre-computed data we needed for inferdpt by using the built-in function of the InferDPTKit class:" ] }, { "cell_type": "code", "execution_count": 28, "id": "c90c7099-d20d-4009-bb7a-aeb3b46210b2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "11400it [00:37, 300.99it/s]\n", "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4096/4096 [00:03<00:00, 1147.93it/s]\n" ] } ], "source": [ "from fate_llm.algo.inferdpt.utils import InferDPTKit\n", "param = InferDPTKit.make_inferdpt_kit_param(embedding_matrix, token_list)" ] }, { "cell_type": "markdown", "id": "0fe3a722-5cdf-4393-90f3-e5d7b82051cf", "metadata": {}, "source": [ "Great, the computation is complete! Now, let’s proceed to perturb a sentence using inferdpt with ε (epsilon) set to 3.0. We will also save the perturbed sentence to a designated folder for future reference." ] }, { "cell_type": "code", "execution_count": 33, "id": "4a6acd81-bc7c-49d3-86f4-ad0b5c329e61", "metadata": {}, "outputs": [], "source": [ "inferdpt_kit = InferDPTKit(*param, tokenizer)" ] }, { "cell_type": "code", "execution_count": 96, "id": "0077696c-0c10-4500-8835-6e72a084bc42", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'into the tree to the woods'" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inferdpt_kit.perturb('From the river to the ocean', epsilon=3.0)" ] }, { "cell_type": "code", "execution_count": 97, "id": "8df2f57a-e202-4d4f-a175-21990223dc3d", "metadata": {}, "outputs": [], "source": [ "save_kit_path = 'your path'\n", "inferdpt_kit.save_to_path(save_kit_path)" ] }, { "cell_type": "markdown", "id": "ced7f1bf-aa49-4806-92e2-712493bb4b10", "metadata": {}, "source": [ "### Go through Inferdpt Step by Step\n", "\n", "Next, we will guide you through the process of using inferdpt step by step. We will simulate the interaction between the client and server locally. Before we begin, let’s discuss model inference. Within fate-llm's inferdpt module, we provide three types of model inference classes: vllm, vllm server, and Huggingface native. You can explore these classes in the [code files](../../../python/fate_llm/algo/inferdpt/inference/) or develop your own inference tool based on your specific needs. We highly recommend using vllm server. In this case, we will use the following two commands to launch two large model services, corresponding to the server’s LLM and the local decoding small model.\n", "\n", "For this example, we have executed the process on a machine equipped with four V100-32G GPUs. We advise you to adjust the model path and GPU settings as necessary to accommodate the specifications of your own machine.\n", "\n", "Start vllm server using commands below:" ] }, { "cell_type": "code", "execution_count": null, "id": "b1b6c3c7-6ddd-4386-8700-c95f74a2bae0", "metadata": {}, "outputs": [], "source": [ "! 
python -m vllm.entrypoints.openai.api_server --host 127.0.0.1 --port 8888 --model ./Mistral-7B-Instruct-v0.2 --dtype=half --enforce-eager --tensor-parallel-size 4 --gpu-memory-utilization 0.6" ] }, { "cell_type": "code", "execution_count": null, "id": "48374cb5-3a5b-456c-9d53-219c2468da63", "metadata": {}, "outputs": [], "source": [ "! python -m vllm.entrypoints.openai.api_server --host 127.0.0.1 --port 8887 --model ./Qwen1.5-0.5B --dtype=half --enforce-eager --tensor-parallel-size 4 --gpu-memory-utilization 0.2" ] }, { "cell_type": "markdown", "id": "375e3f0d-36c7-4ab3-8e65-cccac23e93c6", "metadata": {}, "source": [ "Next, we will initialize the inference instance, which are the parameters for both the inferdpt client and server. This includes specifying the IP address, port, and the model name of the service that has been started." ] }, { "cell_type": "code", "execution_count": 130, "id": "cd099ef4-569d-45b6-9765-502b688c3fb4", "metadata": {}, "outputs": [], "source": [ "from fate_llm.inference.api import APICompletionInference\n", "# for client\n", "inference_client = APICompletionInference(api_url=\"http://127.0.0.1:8887/v1\", model_name='./Qwen1.5-0.5B', api_key='EMPTY')\n", "# for server\n", "inference_server = APICompletionInference(api_url=\"http://127.0.0.1:8888/v1\", model_name='./Mistral-7B-Instruct-v0.2', api_key='EMPTY')" ] }, { "cell_type": "code", "execution_count": 135, "id": "8c430c14-2180-4f02-8f06-3f41bae1a710", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " I am a new user of this forum. I am a 20 year\n" ] } ], "source": [ "ret = inference_client.inference(['Hello how are you?'], inference_kwargs={\n", " 'stop': ['<|im_end|>', '\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 16\n", "})\n", "print(ret[0])" ] }, { "cell_type": "code", "execution_count": 138, "id": "6341eb48-e30f-46d4-aeaa-8c6fd27259b9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " I am an artificial intelligence designed to assist with information and answer questions to the best of my ability. I don't have the ability to have a personal identity or emotions. I'm here to help you with any inquiries you may have. How can I assist you today?\n" ] } ], "source": [ "ret = inference_server.inference(['[INST]Who are u?[/INST]'], inference_kwargs={\n", " 'stop': [''],\n", " 'temperature': 0.01,\n", " 'max_tokens': 128\n", "})\n", "print(ret[0])" ] }, { "cell_type": "markdown", "id": "90f5ce55-de3b-481a-a9cc-cb4c24edb7c2", "metadata": {}, "source": [ "In this tutorial, we will use a question-answering (QA) task as our illustrative example. To do so, we will extract a sample from the ARC-E dataset for demonstration purposes, here is the example:" ] }, { "cell_type": "code", "execution_count": 100, "id": "f912f986-ae86-4d57-9ebc-534d6404173c", "metadata": {}, "outputs": [], "source": [ "test_example = {'id': 'Mercury_7220990',\n", "'question': 'Which factor will most likely cause a person to develop a fever?',\n", "'choices': {'text': ['a leg muscle relaxing after exercise',\n", "'a bacterial population in the bloodstream',\n", "'several viral particles on the skin',\n", "'carbohydrates being digested in the stomach'],\n", "'label': ['A', 'B', 'C', 'D']},\n", "'answerKey': 'B'}" ] }, { "cell_type": "markdown", "id": "a98c74a8-7760-438b-b4f4-33178fed8761", "metadata": {}, "source": [ "Before initiating the inference, it's crucial to understand the sequence of steps involved. 
We will leverage the Jinja2 template engine to structure our documentation as follows:\n", "\n", "1. **Document Template Organization**: The initial step is to organize the document dictionary using the DOC TEMPLATE. This template will provide the structure for the input document.\n", "\n", "2. **Differential Privacy Perturbation**: Apply Differential Privacy (DP) to perturb the structured document string. This will result in a perturbed document. The perturbed document is then added to the original document under the key 'perturbed_doc'. Note that you can modify this key according to your parameter settings.\n", "\n", "3. **Instruction Addition**: Use the INSTRUCTION TEMPLATE to add instructions (or few-shot examples) to the perturbed document. This modified document is then sent to the server side for processing. The server's response is captured, and this perturbed response is appended to the original document under the key 'perturbed_response'. As before, this key can be adjusted as needed.\n", "\n", "4. **Decode Template Formatting**: Finally, employ the decode template to format the decode prompt. The resulting inference is then added to the original dictionary under the key 'inferdpt_result'. This key, like the others, can be customized to fit your specific parameters.\n", "\n", "By following these steps, the inferdpt framework enables a structured and privacy-preserving inference process, leading to a final output that incorporates the perturbed data and the model's response.\n", "For more details, you can refer to the source codes:\n" ] }, { "cell_type": "markdown", "id": "09d7377a-22f4-4d04-b886-88faa1384d7f", "metadata": {}, "source": [ "The templates for this example are defined on the client side. Below is the Jinja template we use:" ] }, { "cell_type": "code", "execution_count": 141, "id": "eff74a65-f765-483f-a685-418376414ff0", "metadata": {}, "outputs": [], "source": [ "doc_template = \"\"\"{{question}} \n", "Choices:{{choices.text}}\n", "\"\"\"\n", "\n", "instruction_template=\"\"\"\n", "[INST]\n", "Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'.\n", "\n", "Please explain:\n", "Question:{{perturbed_doc}}\n", "Rationale:\n", "[/INST]\n", "\"\"\"\n", "\n", "decode_template = \"\"\"Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. 
Therefore, the answer is 'dry palms'.\n", "\n", "Question:{{perturbed_doc}}\n", "Rationale:{{perturbed_response | replace('\\n', '')}}\n", "\n", "Please explain:\n", "Question:{{question}} \n", "Choices:{{choices.text}}\n", "Rationale:\n", "\"\"\"" ] }, { "cell_type": "markdown", "id": "b3c02898-91df-48b6-a0e2-9af5bd5538d8", "metadata": {}, "source": [ "Please be aware that we have included a one-shot example in the prompt to ensure that the Large Language Model (LLM) responds as anticipated.\n", "\n", "Now we create two script: \n", "- inferdpt_client.py\n", "- inferdpt_server.py\n", "\n", "And run codes provided below:" ] }, { "cell_type": "markdown", "id": "f0f110a3-c601-4c7b-8e89-8684d2ae266d", "metadata": {}, "source": [ "#### Client Side: inferdpt_client.py" ] }, { "cell_type": "code", "execution_count": null, "id": "006612b8-da8d-402c-9b6d-b6786325fa7c", "metadata": {}, "outputs": [], "source": [ "from fate_llm.inference.api import APICompletionInference\n", "from fate_llm.algo.inferdpt import inferdpt\n", "from fate_llm.algo.inferdpt.utils import InferDPTKit\n", "import sys\n", "\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "\n", "ctx = create_ctx(guest)\n", "save_kit_path = 'your path'\n", "kit = InferDPTKit.load_from_path(save_kit_path)\n", "inference = APICompletionInference(api_url=\"http://127.0.0.1:8887/v1\", model_name='./Qwen1.5-0.5B', api_key='EMPTY')\n", "\n", "test_example = {'id': 'Mercury_7220990',\n", "'question': 'Which factor will most likely cause a person to develop a fever?',\n", "'choices': {'text': ['a leg muscle relaxing after exercise',\n", "'a bacterial population in the bloodstream',\n", "'several viral particles on the skin',\n", "'carbohydrates being digested in the stomach'],\n", "'label': ['A', 'B', 'C', 'D']},\n", "'answerKey': 'B'}\n", "\n", "\n", "doc_template = \"\"\"{{question}} \n", "Choices:{{choices.text}}\n", "\"\"\"\n", "\n", "instruction_template=\"\"\"\n", "[INST]\n", "Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. 
Therefore, the answer is 'dry palms'.\n", "\n", "Please explain:\n", "Question:{{perturbed_doc}}\n", "Rationale:\n", "[/INST]\n", "\"\"\"\n", "\n", "decode_template = \"\"\"Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", "Use to finish your rationle.\"\n", "\n", "Example(s):\n", "Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", "Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", "Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'.\n", "\n", "Question:{{perturbed_doc}}\n", "Rationale:{{perturbed_response | replace('\\n', '')}}\n", "\n", "Please explain:\n", "Question:{{question}} \n", "Choices:{{choices.text}}\n", "Rationale:\n", "\"\"\"\n", "\n", "inferdpt_client = inferdpt.InferDPTClient(ctx, kit, inference, epsilon=3.0)\n", "result = inferdpt_client.inference([test_example], doc_template, instruction_template, decode_template, \\\n", " remote_inference_kwargs={\n", " 'stop': ['<\\s>'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " },\n", " local_inference_kwargs={\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " })\n", "print('result is {}'.format(result[0]['inferdpt_result']))" ] }, { "cell_type": "markdown", "id": "e6ed3c0e-0b1f-4087-b155-def3ee957618", "metadata": {}, "source": [ "#### Server Side: inferdpt_server.py" ] }, { "cell_type": "code", "execution_count": null, "id": "96e3e9fa-9554-4bcf-b8bf-358c469014bf", "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.inferdpt.inferdpt import InferDPTServer\n", "import sys\n", "from fate_llm.algo.inferdpt.inference.api import APICompletionInference\n", "\n", "\n", "arbiter = (\"arbiter\", 10000)\n", "guest = (\"guest\", 10000)\n", "host = (\"host\", 9999)\n", "name = \"fed1\"\n", "\n", "\n", "def create_ctx(local):\n", " from fate.arch import Context\n", " from fate.arch.computing.backends.standalone import CSession\n", " from fate.arch.federation.backends.standalone import StandaloneFederation\n", " import logging\n", "\n", " logger = logging.getLogger()\n", " logger.setLevel(logging.INFO)\n", "\n", " console_handler = logging.StreamHandler()\n", " console_handler.setLevel(logging.INFO)\n", "\n", " formatter = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n", " console_handler.setFormatter(formatter)\n", "\n", " logger.addHandler(console_handler)\n", " computing = CSession(data_dir=\"./session_dir\")\n", " return Context(computing=computing, federation=StandaloneFederation(computing, name, local, [guest, host, arbiter]))\n", "\n", "\n", "ctx = create_ctx(arbiter)\n", "inference_server = APICompletionInference(api_url=\"http://127.0.0.1:8888/v1\", model_name='./Mistral-7B-Instruct-v0.2', api_key='EMPTY')\n", "inferdpt_server = InferDPTServer(ctx, inference_server)\n", "inferdpt_server.inference()" ] }, { "cell_type": "markdown", "id": "bfef704b-179e-44cd-84dc-a40b036e7f28", "metadata": {}, "source": [ "Start two terminals and launch the client & server scripts simultaneously.\n", "On the client side we can get the answer:\n", "\n", "```\n", "The given 
question asks which factor will most likely cause a person to develop a fever. The factors mentioned are a leg muscle relaxing after exercise, a bacterial population in the bloodstream, several viral particles on the skin, and carbohydrates being digested in the stomach. The question is asking which factor is most likely to cause a person to develop a fever. The factors are all related to the body's internal environment, but the most likely factor is a bacterial population in the bloodstream. This is because bacteria can cause a fever, and the body's immune system responds to the infection by producing antibodies that can fight off the bacteria. Therefore, the answer is 'a bacterial population in the bloodstream'\n", "```" ] }, { "cell_type": "markdown", "id": "adf80b4e-4727-4ee4-b0b1-4839bd516f4f", "metadata": {}, "source": [ "## Use Inferdpt in FATE Pipeline" ] }, { "cell_type": "markdown", "id": "b9b560e3-8db4-4828-a4fc-494320a9a3e5", "metadata": {}, "source": [ "We can leverage the FATE pipeline to submit inference tasks for industrial applications. When operating in pipeline mode, to safeguard against privacy breaches such as API key or server path leakage, it is crucial to create initialization scripts for establishing inferdpt client instances. Alternatively, you can modify the provided scripts within the fate_llm/algo/inferdpt/init folder.\n", "\n", "Below, we provide an overview of the default_init.py script, which serves as an example of how to create an [initialization class](../../../python/fate_llm/algo/inferdpt/init/default_init.py). By customizing the static variables within this class, you can configure the client and server to interact with the Large Language Model (LLM) interfaces as intended." ] }, { "cell_type": "code", "execution_count": null, "id": "eab49960-0541-4059-b84d-bee4bb690974", "metadata": {}, "outputs": [], "source": [ "from fate_llm.algo.inferdpt.init._init import InferClientInit\n", "from fate_llm.inference.api import APICompletionInference\n", "from fate_llm.algo.inferdpt import inferdpt\n", "from fate_llm.algo.inferdpt.utils import InferDPTKit\n", "from fate_llm.algo.inferdpt.inferdpt import InferDPTClient, InferDPTServer\n", "\n", "\n", "class InferDPTAPIClientInit(InferClientInit):\n", "\n", " api_url = ''\n", " api_model_name = ''\n", " api_key = 'EMPTY'\n", " inferdpt_kit_path = ''\n", " eps = 3.0\n", "\n", " def __init__(self, ctx):\n", " super().__init__(ctx)\n", " self.ctx = ctx\n", "\n", " def get_inst(self)-> InferDPTClient:\n", " inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key)\n", " kit = InferDPTKit.load_from_path(self.inferdpt_kit_path)\n", " inferdpt_client = inferdpt.InferDPTClient(self.ctx, kit, inference, epsilon=self.eps)\n", " return inferdpt_client\n", "\n", "\n", "class InferDPTAPIServerInit(InferClientInit):\n", "\n", " api_url = ''\n", " api_model_name = ''\n", " api_key = 'EMPTY'\n", "\n", " def __init__(self, ctx):\n", " super().__init__(ctx)\n", " self.ctx = ctx\n", "\n", " def get_inst(self)-> InferDPTServer:\n", " inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key)\n", " inferdpt_server = inferdpt.InferDPTServer(self.ctx,inference_inst=inference)\n", " return inferdpt_server\n", " " ] }, { "cell_type": "markdown", "id": "0a5c9d6b-94b9-4ae3-80f7-20d1a698764c", "metadata": {}, "source": [ "In the pipeline example, we use arc_easy dataset and our built-in huggingface dataset. 
Only HuggingfaceDataset is supported in the pipeline mode:" ] }, { "cell_type": "code", "execution_count": 1, "id": "15276057-fdda-4cc6-8678-eb1f485e4c58", "metadata": {}, "outputs": [], "source": [ "from fate_llm.dataset.hf_dataset import HuggingfaceDataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "31cce967-3f5f-4261-ae17-9089368b82f9", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "dataset = load_dataset('arc_easy')\n", "dataset.save_to_disk('your_path/arc_easy')" ] }, { "cell_type": "code", "execution_count": 6, "id": "af9adcb4-766d-45f6-a13c-5c127df61e5b", "metadata": {}, "outputs": [], "source": [ "ds = HuggingfaceDataset(load_from_disk= True, data_split_key='train')\n", "ds.load('your_path/arc_easy')" ] }, { "cell_type": "code", "execution_count": 7, "id": "899c410f-fe68-4f7e-936e-8b11720ff148", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'id': 'Mercury_7220990', 'question': 'Which factor will most likely cause a person to develop a fever?', 'choices': {'text': ['a leg muscle relaxing after exercise', 'a bacterial population in the bloodstream', 'several viral particles on the skin', 'carbohydrates being digested in the stomach'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'B'}\n" ] } ], "source": [ "print(ds[0])" ] }, { "cell_type": "markdown", "id": "5f69f8cf-f40d-418a-be2a-753d67537442", "metadata": {}, "source": [ "After that, we can associate the dataset path with a name and namespace. By specifying the dataset configuration, the HuggingfaceDataset will be initialized and the dataset will be loaded from the specified path. \n", "```\n", "flow table bind --namespace experiment --name arc_e --path 'your_path/arc_easy'\n", "```\n", "Once these initialization scripts are in place, you can submit a pipeline task by specifying the initialization class in the configuration files. For more information, refer to the script provided below:" ] }, { "cell_type": "code", "execution_count": null, "id": "4da1aea7-0ba2-4ebb-918f-cfcf24d4498b", "metadata": {}, "outputs": [], "source": [ "import argparse\n", "from fate_client.pipeline.utils import test_utils\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline import FateFlowPipeline\n", "\n", "\n", "def main(config=\"../../config.yaml\", namespace=\"\"):\n", " # obtain config\n", " if isinstance(config, str):\n", " config = test_utils.load_job_config(config)\n", " parties = config.parties\n", " guest = parties.guest[0]\n", " arbiter = parties.arbiter[0]\n", "\n", " pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", "\n", " reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest))\n", " reader_0.guest.task_parameters(\n", " namespace=f\"experiment{namespace}\",\n", " name=\"arc_e\"\n", " )\n", "\n", " inferdpt_init_conf_client = {\n", " 'module_name': 'fate_llm.algo.inferdpt.init.default_init',\n", " 'item_name': 'InferDPTAPIClientInit'\n", " }\n", "\n", " dataset_conf = {\n", " 'module_name': 'fate_llm.dataset.hf_dataset',\n", " 'item_name': 'HuggingfaceDataset',\n", " 'kwargs':{\n", " 'load_from_disk': True,\n", " 'data_split_key': 'train'\n", " }\n", " }\n", "\n", " doc_template = \"\"\"{{question}} \n", " Choices:{{choices.text}}\n", " \"\"\"\n", "\n", " instruction_template=\"\"\"\n", " <|im_start|>system\n", " You are a helpful assistant.<|im_end|>\n", " <|im_start|>user\n", " Select Answer from Choices and explain it in \"Rationale\" with few words. 
Please refer to the example to write the rationale.\n", " Use to finish your rationle.\"\n", "\n", " Example(s):\n", " Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", " Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", " Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'.\n", "\n", " Please explain:\n", " Question:{{perturbed_doc}}\n", " Rationale:\n", " <|im_end|>\n", " <|im_start|>assistant\n", " \"\"\"\n", "\n", " decode_template = \"\"\"Select Answer from Choices and explain it in \"Rationale\" with few words. Please refer to the example to write the rationale.\n", " Use to finish your rationle.\"\n", "\n", " Example(s):\n", " Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n", " Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion']\n", " Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'.\n", "\n", " Question:{{perturbed_doc}}\n", " Rationale:{{perturbed_response | replace('\\n', '')}}\n", "\n", " Please explain:\n", " Question:{{question}} \n", " Choices:{{choices.text}}\n", " Rationale:\n", " \"\"\"\n", "\n", " remote_inference_kwargs={\n", " 'stop': [['<\\s>']],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " }\n", "\n", " local_inference_kwargs={\n", " 'stop': ['<|im_end|>', '', '\\n', '\\n\\n', '.\\n\\n\\n\\n\\n', '<|end_of_text|>', '>\\n\\n\\n'],\n", " 'temperature': 0.01,\n", " 'max_tokens': 256\n", " }\n", "\n", " inferdpt_client_conf = {\n", " 'inferdpt_init_conf': inferdpt_init_conf_client,\n", " 'dataset_conf': dataset_conf,\n", " 'doc_template': doc_template,\n", " 'instruction_template': instruction_template,\n", " 'decode_template': decode_template,\n", " 'dataset_conf': dataset_conf,\n", " 'remote_inference_kwargs': remote_inference_kwargs,\n", " 'local_inference_kwargs': local_inference_kwargs\n", " }\n", "\n", " inferdpt_init_conf_server = {\n", " 'module_name': 'fate_llm.algo.inferdpt.init.default_init',\n", " 'item_name': 'InferDPTAPIServerInit'\n", " }\n", "\n", " inferdpt_server_conf = {\n", " 'inferdpt_init_conf': inferdpt_init_conf_server\n", " }\n", "\n", " homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " runner_module='inferdpt_runner',\n", " runner_class='InferDPTRunner',\n", " train_data=reader_0.outputs[\"output_data\"]\n", " )\n", "\n", " homo_nn_0.guest.task_parameters(runner_conf=inferdpt_client_conf)\n", " homo_nn_0.arbiter.task_parameters(runner_conf=inferdpt_server_conf)\n", " pipeline.add_tasks([reader_0, homo_nn_0])\n", " pipeline.compile()\n", " pipeline.fit()\n", "\n", "\n", "if __name__ == \"__main__\":\n", " parser = argparse.ArgumentParser(\"PIPELINE DEMO\")\n", " parser.add_argument(\"--config\", type=str, default=\"../config.yaml\",\n", " help=\"config file\")\n", " parser.add_argument(\"--namespace\", type=str, default=\"\",\n", " help=\"namespace for data stored in FATE\")\n", " args = parser.parse_args()\n", " main(config=args.config, 
namespace=args.namespace)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "c2345e19-83eb-4196-9606-74658c8fbdc5", "metadata": {}, "source": [ "# Offsite-tuning Tutorial" ] }, { "cell_type": "markdown", "id": "9f1d728c-09e1-418e-8d80-53dd0ec467b1", "metadata": {}, "source": [ "In this tutorial, we'll focus on how to leverage the Offsite-Tuning framework in FATE-LLM-2.0 to fine-tune your LLM. You'll learn how to:\n", "\n", "1. Define models compatible with the Offsite-Tuning framework, including main models (which are at the server side and offer adapters and emulators) and sub models (which are at the client side and load adapters and emulators for local fine-tuning).\n", "2. Get hands-on experience with the Offsite-Tuning trainer.\n", "3. Define configurations for advanced setups (using Deepspeed, offsite-tuning + federation) through FATE-pipeline." ] }, { "cell_type": "markdown", "id": "31432345-5cce-4efa-9a9b-844f997f14ad", "metadata": {}, "source": [ "## Introduction of Offsite-tuning\n", "\n", "Offsite-Tuning is a novel approach designed for the efficient and privacy-preserving adaptation of large foundational models for specific downstream tasks. The framework allows data owners to fine-tune models locally without uploading sensitive data to the LLM owner's servers. Specifically, the LLM owner sends a lightweight \"Adapter\" and a lossy compressed \"Emulator\" to the data owner. Using these smaller components, the data owner can then fine-tune the model solely on their private data. The Adapter, once fine-tuned, is returned to the model owner and integrated back into the large model to enhance its performance on the specific dataset.\n", "\n", "Offsite-Tuning addresses the challenge of unequal distribution of computational power and data. It allows the model owner to enhance the model's capabilities without direct access to private data, while also enabling data owners who may not have the resources to train a full-scale model to fine-tune a portion of it using less computational power. This mutually beneficial arrangement accommodates both parties involved.\n", "\n", "Beyond the standard two-party setup involving the model owner and the data owner, in FATE-LLM the Offsite-Tuning framework is also extendable to scenarios with multiple data owners. 
FATE supports multi-party Offsite-Tuning, allowing multiple data owners to fine-tune and aggregate their Adapters locally, further enhancing the flexibility and applicability of this framework. For more details of Offsite-tuning, please refer to the [original paper](https://arxiv.org/pdf/2302.04870.pdf).\n" ] }, { "cell_type": "markdown", "id": "2e7ac467-e5df-4bf3-8571-0a477ab4612d", "metadata": {}, "source": [ "## Preliminary\n", "\n", "We strongly recommend you finish reading our NN tutorial to get familiar with Model and Dataset customizations: [NN Tutorials](https://github.com/FederatedAI/FATE/blob/master/doc/2.0/fate/components/pipeline_nn_cutomization_tutorial.md)\n", "\n", "In this tutorial, we assume that you have deployed the code of FATE (including fateflow & fate-client) & FATE-LLM-2.0. You can add the python path so that you can run the code in this notebook." ] }, { "cell_type": "code", "execution_count": 4, "id": "f33516e8-0d28-4c97-bc38-ba28d60acf37", "metadata": {}, "outputs": [], "source": [ "import sys\n", "your_path_to_fate_python = 'xxx/fate/fate/python'\n", "sys.path.append(your_path_to_fate_python)" ] }, { "cell_type": "markdown", "id": "2f2fc794", "metadata": {}, "source": [ "If you installed FATE & FATE-LLM-2.0 via pip, you can run the following code directly." ] }, { "cell_type": "markdown", "id": "7309281b-5956-4158-9256-d6db230e086d", "metadata": {}, "source": [ "## Define Main Model and Sub Model\n", "\n", "Main models are at the server side and provide the weights of adapters and emulators to the client side, while Sub Models are at the client side and load adapters and emulators for local fine-tuning. In this chapter we will take a standard GPT-2 as the example and show you how to quickly develop a main model class and a sub model class for offsite-tuning.\n", "\n", "### Base Classes and Interfaces\n", "\n", "The base classes for the Main and Sub Models are OffsiteTuningMainModel and OffsiteTuningSubModel, respectively. To build your own models upon these base classes, you need to:\n", "\n", "1. Implement three key interfaces: get_base_model, get_model_transformer_blocks, and forward. The get_base_model interface should return the full Main or Sub Model. Meanwhile, the get_model_transformer_blocks function should return a ModuleList of all transformer blocks present in your language model, enabling the extraction of emulators and adapters from these blocks. Finally, you're required to implement the forward process for model inference.\n", "\n", "2. Supply the parameters emulator_layer_num, adapter_top_layer_num, and adapter_bottom_layer_num to the parent class. This allows the framework to automatically generate the top and bottom adapters as well as the dropout emulator for you. Specifically, the top adapters are taken from the top of the transformer blocks, while the bottom adapters are taken from the bottom. The emulator is a dropout emulator consistent with the paper's specification: once the adapter layers are removed, it is formed by selecting transformer blocks at fixed intervals and stacking them (see the short sketch below).\n", "\n", "Our framework will automatically detect the emulator and adapters of a main model and send them to clients. Clients' models then load the weights of the emulators and adapters to obtain trainable models.\n", "\n", "### Example\n", "\n", "Let us take a look at our built-in GPT-2 model. It will be easy for you to build main models and sub models based on the framework. 
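Before diving into the code, here is a tiny standalone sketch (an addition for illustration, not the framework's implementation) of how picking transformer blocks at fixed intervals yields a dropout emulator:\n", "\n", "```python\n", "import torch.nn as nn\n", "\n", "def build_dropout_emulator(blocks, emulator_layer_num, adapter_top_layer_num, adapter_bottom_layer_num):\n", "    # drop the adapter blocks, then pick the remaining blocks at a fixed stride\n", "    middle = list(blocks[adapter_bottom_layer_num: len(blocks) - adapter_top_layer_num])\n", "    stride = len(middle) / emulator_layer_num\n", "    picked = [middle[int(i * stride)] for i in range(emulator_layer_num)]\n", "    return nn.ModuleList(picked)\n", "```\n", "\n", "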
Please notice that the GPT2LMHeadSubModel's base model is intialized from a GPTConfig, that is to say, it's weights are random and need to load pretrained weights from server." ] }, { "cell_type": "code", "execution_count": null, "id": "8611c115-0321-458f-b190-49dcb127a653", "metadata": {}, "outputs": [], "source": [ "from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel\n", "from transformers import GPT2LMHeadModel, GPT2Config\n", "from torch import nn\n", "import torch as t\n", "\n", "\n", "class GPT2LMHeadMainModel(OffsiteTuningMainModel):\n", "\n", " def __init__(\n", " self,\n", " model_name_or_path,\n", " emulator_layer_num: int,\n", " adapter_top_layer_num: int = 2,\n", " adapter_bottom_layer_num: int = 2):\n", "\n", " self.model_name_or_path = model_name_or_path\n", " super().__init__(\n", " emulator_layer_num,\n", " adapter_top_layer_num,\n", " adapter_bottom_layer_num)\n", "\n", " def get_base_model(self):\n", " return GPT2LMHeadModel.from_pretrained(self.model_name_or_path)\n", "\n", " def get_model_transformer_blocks(self, model: GPT2LMHeadModel):\n", " return model.transformer.h\n", "\n", " def forward(self, x):\n", " return self.model(**x)\n", "\n", "class GPT2LMHeadSubModel(OffsiteTuningSubModel):\n", "\n", " def __init__(\n", " self,\n", " model_name_or_path,\n", " emulator_layer_num: int,\n", " adapter_top_layer_num: int = 2,\n", " adapter_bottom_layer_num: int = 2,\n", " fp16_mix_precision=False,\n", " partial_weight_decay=None):\n", "\n", " self.model_name_or_path = model_name_or_path\n", " self.emulator_layer_num = emulator_layer_num\n", " self.adapter_top_layer_num = adapter_top_layer_num\n", " self.adapter_bottom_layer_num = adapter_bottom_layer_num\n", " super().__init__(\n", " emulator_layer_num,\n", " adapter_top_layer_num,\n", " adapter_bottom_layer_num,\n", " fp16_mix_precision)\n", " self.partial_weight_decay = partial_weight_decay\n", "\n", " def get_base_model(self):\n", " total_layer_num = self.emulator_layer_num + \\\n", " self.adapter_top_layer_num + self.adapter_bottom_layer_num\n", " config = GPT2Config.from_pretrained(self.model_name_or_path)\n", " config.num_hidden_layers = total_layer_num\n", " # initialize a model without pretrained weights\n", " return GPT2LMHeadModel(config)\n", "\n", " def get_model_transformer_blocks(self, model: GPT2LMHeadModel):\n", " return model.transformer.h\n", " \n", " def forward(self, x):\n", " return self.model(**x)\n" ] }, { "cell_type": "markdown", "id": "abd1f63f-afa7-4f09-a67e-63812ddcd801", "metadata": {}, "source": [ "We can define a server side model and a client side model that can work together in the offsite-tuning:" ] }, { "cell_type": "code", "execution_count": null, "id": "04870e76-11cc-4d79-a09e-b6fd16ed2f23", "metadata": {}, "outputs": [], "source": [ "model_main = GPT2LMHeadMainModel('gpt2', 4, 2, 2)\n", "model_sub = GPT2LMHeadSubModel('gpt2', 4, 2, 2)" ] }, { "cell_type": "markdown", "id": "19d34937-b4ae-436e-b4ea-1620fb80bed4", "metadata": {}, "source": [ "### Share additional parameters with clients\n", "\n", "Additionally, beyond the weights of emulators and adapters, you may also want to share other model parameters, such as embedding weights, with your client partners. To achieve this, you'll need to implement two more interfaces: get_additional_param_state_dict and load_additional_param_state_dict for both the Main and Sub Models." 
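, "\n", "The next cell relies on two helpers from the built-in offsite-tuning models, split_numpy_array and recover_numpy_array (not imported in the snippet). Roughly speaking (a simplified sketch of the idea, not the library's exact implementation), they break a large weight matrix into several named chunks for transfer and stitch the chunks back together:\n", "\n", "```python\n", "import numpy as np\n", "\n", "def split_numpy_array(arr, n, prefix):\n", "    # split a large array into n named chunks, e.g. {'wte_0': ..., 'wte_1': ...}\n", "    return {f\"{prefix}_{i}\": chunk for i, chunk in enumerate(np.array_split(arr, n, axis=0))}\n", "\n", "def recover_numpy_array(chunks, prefix):\n", "    # concatenate the named chunks back into the original array\n", "    keys = sorted(chunks, key=lambda k: int(k.rsplit('_', 1)[-1]))\n", "    return np.concatenate([chunks[k] for k in keys], axis=0)\n", "```\n"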
] }, { "cell_type": "code", "execution_count": null, "id": "189fce0e-8e4d-4368-8e14-907b30ce0a49", "metadata": {}, "outputs": [], "source": [ "def get_additional_param_state_dict(self):\n", " # get parameter of additional parameter\n", " model = self.model\n", " param_dict = {\n", " 'wte': model.transformer.wte,\n", " 'wpe': model.transformer.wpe,\n", " 'last_ln_f': model.transformer.ln_f\n", " }\n", "\n", " addition_weights = self.get_numpy_state_dict(param_dict)\n", "\n", " wte = addition_weights.pop('wte')\n", " wte_dict = split_numpy_array(wte, 10, 'wte')\n", " wpe = addition_weights.pop('wpe')\n", " wpe_dict = split_numpy_array(wpe, 10, 'wpe')\n", " addition_weights.update(wte_dict)\n", " addition_weights.update(wpe_dict)\n", " return addition_weights\n", "\n", "def load_additional_param_state_dict(self, submodel_weights: dict):\n", " # load additional weights:\n", " model = self.model\n", " param_dict = {\n", " 'wte': model.transformer.wte,\n", " 'wpe': model.transformer.wpe,\n", " 'last_ln_f': model.transformer.ln_f\n", " }\n", "\n", " new_submodel_weight = {}\n", " new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f']\n", " wte_dict, wpe_dict = {}, {}\n", " for k, v in submodel_weights.items():\n", " if 'wte' in k:\n", " wte_dict[k] = v\n", " if 'wpe' in k:\n", " wpe_dict[k] = v\n", " wte = recover_numpy_array(wte_dict, 'wte')\n", " wpe = recover_numpy_array(wpe_dict, 'wpe')\n", " new_submodel_weight['wte'] = wte\n", " new_submodel_weight['wpe'] = wpe\n", "\n", " self.load_numpy_state_dict(param_dict, new_submodel_weight)" ] }, { "cell_type": "markdown", "id": "59d9aa6a-80e9-4130-8af1-c7d2bd0fbba3", "metadata": {}, "source": [ "From these codes we can see that we use 'split_numpy_array, recover_numpy_array' to cut embedding weights into pieces and recover them." ] }, { "cell_type": "markdown", "id": "dda6f5e3-d05a-4cdf-afd4-affbc162fce4", "metadata": {}, "source": [ "## Submit a Offsite-tuning Task - A QA Task Sample with GPT2\n", "\n", "Now we are going to show you how to run a 2 party(server & client) offsite-tuning task using the GPT-2 model defined above. Before we submit the task we need to prepare the QA dataset.\n", "\n", "### Prepare QA Dataset - Sciq\n", "\n", "In this example, we use sciq dataset. You can use tools provided in our qa_dataset.py to tokenize the sciq dataset and save the tokenized result. **Remember to modify the save_path to your own path.** For the sake of simplicity, in this tutorial, for every party we only use this dataset to train the model." 
] }, { "cell_type": "code", "execution_count": 1, "id": "84f6947e-f0a3-4a42-9549-a9776a15b66d", "metadata": {}, "outputs": [], "source": [ "from fate_llm.dataset.qa_dataset import tokenize_qa_dataset\n", "from transformers import AutoTokenizer\n", "tokenizer_name_or_path = 'gpt2'\n", "tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)\n", "\n", "if 'llama' in tokenizer_name_or_path:\n", " tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, unk_token=\"\", bos_token=\"\", eos_token=\"\", add_eos_token=True) \n", " tokenizer.pad_token = tokenizer.eos_token\n", "else:\n", " tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)\n", "if 'gpt2' in tokenizer_name_or_path:\n", " tokenizer.pad_token = tokenizer.eos_token\n", "\n", "import os\n", "# bind data path to name & namespace\n", "save_path = 'xxxx/sciq'\n", "rs = tokenize_qa_dataset('sciq', tokenizer, save_path, seq_max_len=600) # we save the cache dataset to the fate root folder" ] }, { "cell_type": "markdown", "id": "adabe89a-37be-4c64-bd83-4f8c8b80096f", "metadata": {}, "source": [ "We can use our built-in QA dataset to load tokenized dataset, to see if everything is working correctly." ] }, { "cell_type": "code", "execution_count": 12, "id": "6500c2ba-bc39-4db4-b2ea-947fb09c334e", "metadata": {}, "outputs": [], "source": [ "from fate_llm.dataset.qa_dataset import QaDataset\n", "\n", "ds = QaDataset(tokenizer_name_or_path=tokenizer_name_or_path)\n", "ds.load(save_path)" ] }, { "cell_type": "code", "execution_count": 13, "id": "d6f62b60-eed0-4bd0-874e-ae3feeebb120", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11679\n", "600\n" ] } ], "source": [ "print(len(ds)) # train set length\n", "print(ds[0]['input_ids'].__len__()) # first sample length" ] }, { "cell_type": "markdown", "id": "0609c63d-35a4-43bc-bd4b-f1c61adea587", "metadata": {}, "source": [ "## Submit a Task\n", "\n", "Now the model and the dataset is prepared! We can submit a training task. In the FATE-2.0, you can define your pipeline in a much easier manner.\n", "\n", "After we submit the task below, the following process will occur: The server and client each initialize their respective models. The server extracts shared parameters and sends them to the client. The client then loads these parameters and conducts training on a miniaturized GPT-2 model composed of an emulator and adapter on SciqP \n", "\n", "If you are not familiar with trainer configuration, please refer to [NN Tutorials](https://github.com/FederatedAI/FATE/blob/master/doc/2.0/fate/components/pipeline_nn_cutomization_tutorial.md).\n", "\n", " Upon completion of the training, the client sends the adapter parameters back to the server. Since we are directly using Hugging Face's LMHeadGPT2, there's no need to supply a loss function. Simply inputting the preprocessed data and labels into the model will calculate the correct loss and proceed with gradient descent\n", "\n", "One thing to pay special attention to is that Offsite-Tuning differs from FedAvg within FATE. In Offsite-Tuning, the server (the arbiter role) needs to initialize the model. Therefore, please refer to the example below and set the runner conf separately for the client and the server.\n", "\n", "To make this a quick demo, we only select 100 samples from the origin qa datset, see 'select_num=100' in the LLMDatasetLoader." 
] }, { "cell_type": "markdown", "id": "261dfb43", "metadata": {}, "source": [ "### Bind Dataset Path with Name & Namespace\n", "\n", "Plase execute the following code to bind the dataset path with name & namespace. Remember to modify the path to your own dataset save path." ] }, { "cell_type": "code", "execution_count": null, "id": "8dc1e82b", "metadata": {}, "outputs": [], "source": [ "! flow table bind --namespace experiment --name sciq --path YOUR_SAVE_PATH" ] }, { "cell_type": "markdown", "id": "0e8c5ff4", "metadata": {}, "source": [ "### Pipeline codes" ] }, { "cell_type": "code", "execution_count": 16, "id": "c9113d10-c3e7-4875-9502-ce46aa0b86b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import time\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline import FateFlowPipeline\n", "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner\n", "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", "from fate_client.pipeline.components.fate.nn.torch.base import Sequential\n", "from fate_client.pipeline.components.fate.nn.torch import nn\n", "\n", "\n", "guest = '9999'\n", "host = '9999'\n", "arbiter = '9999'\n", "\n", "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", "pipeline.set_site_party_id('9999')\n", "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest))\n", "reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"sciq\"\n", ")\n", "\n", "client_model = LLMModelLoader(\n", " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',\n", " model_name_or_path='gpt2',\n", " emulator_layer_num=4,\n", " adapter_top_layer_num=1,\n", " adapter_bottom_layer_num=1\n", ")\n", "\n", "server_model = LLMModelLoader(\n", " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',\n", " model_name_or_path='gpt2',\n", " emulator_layer_num=4,\n", " adapter_top_layer_num=1,\n", " adapter_bottom_layer_num=1 \n", ")\n", "\n", "train_args = Seq2SeqTrainingArguments(\n", " per_device_train_batch_size=1,\n", " learning_rate=5e-5,\n", " disable_tqdm=False,\n", " num_train_epochs=1,\n", " logging_steps=10,\n", " logging_strategy='steps',\n", " use_cpu=False\n", ")\n", "\n", "dataset = LLMDatasetLoader(\n", " module_name='qa_dataset', item_name='QaDataset',\n", " tokenizer_name_or_path='gpt2',\n", " select_num=100\n", ")\n", "\n", "data_collator = LLMDataFuncLoader(module_name='data_collator.cust_data_collator', item_name='get_seq2seq_data_collator', tokenizer_name_or_path='gpt2')\n", "\n", "client_conf = get_conf_of_ot_runner(\n", " model=client_model,\n", " dataset=dataset,\n", " data_collator=data_collator,\n", " training_args=train_args,\n", " fed_args=FedAVGArguments(),\n", " aggregate_model=False\n", ")\n", "\n", "server_conf = get_conf_of_ot_runner(\n", " model=server_model,\n", " dataset=dataset,\n", " data_collator=data_collator,\n", " training_args=train_args,\n", " fed_args=FedAVGArguments(),\n", " aggregate_model=False\n", ")\n", "\n", "homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"offsite_tuning_runner\",\n", " runner_class=\"OTRunner\"\n", ")\n", "\n", 
"homo_nn_0.guest.task_parameters(runner_conf=client_conf)\n", "homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)\n", "pipeline.add_tasks([reader_0, homo_nn_0])\n", "pipeline.compile()" ] }, { "cell_type": "markdown", "id": "e97c2823", "metadata": {}, "source": [ "You can try to initialize your models, datasets to check if they can be loaded correctly." ] }, { "cell_type": "code", "execution_count": 17, "id": "872817e5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GPT2LMHeadSubModel(\n", " (model): GPT2LMHeadModel(\n", " (transformer): GPT2Model(\n", " (wte): Embedding(50257, 768)\n", " (wpe): Embedding(1024, 768)\n", " (drop): Dropout(p=0.1, inplace=False)\n", " (h): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (1): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (2): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (3): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (4): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (5): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " 
(resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", " )\n", " (emulator): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (1): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (2): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (3): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (adapter_bottom): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (adapter_top): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " 
(c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", ")\n", "**********\n", "\n", "**********\n", "DataCollatorForSeq2Seq(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={\n", "\t50256: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n", "}, model=None, padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')\n" ] } ], "source": [ "print(client_model())\n", "print('*' * 10)\n", "print(dataset())\n", "print('*' * 10)\n", "print(data_collator())" ] }, { "cell_type": "markdown", "id": "898c3491", "metadata": {}, "source": [ "It seems that everything is ready! Now we can submit the task by running the code below." ] }, { "cell_type": "code", "execution_count": 2, "id": "74497742-4030-4a7a-a13e-2c020da47cd1", "metadata": {}, "outputs": [], "source": [ "pipeline.fit()" ] }, { "cell_type": "markdown", "id": "b33b2e2b-3b53-4881-8db6-a67e1293e88b", "metadata": {}, "source": [ "## Add Deepspeed Setting\n", "\n", "By simply adding a ds_config, we can run our task with a deepspeed backend. If you have deployed an eggroll environment, you can submit the task with deepspeed to eggroll to accelerate your training." ] }, { "cell_type": "code", "execution_count": 5, "id": "6e8f063b-263c-4ba5-b2ba-98a86ce38b94", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import time\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline import FateFlowPipeline\n", "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner\n", "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", "from peft import LoraConfig, TaskType\n", "from transformers.modeling_utils import unwrap_model\n", "\n", "\n", "guest = '10000'\n", "host = '10000'\n", "arbiter = '10000'\n", "\n", "# pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)\n", "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", "\n", "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest))\n", "reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"sciq\"\n", ")\n", "\n", "client_model = LLMModelLoader(\n", " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',\n", " model_name_or_path='gpt2',\n", " emulator_layer_num=18,\n", " adapter_top_layer_num=2,\n", " adapter_bottom_layer_num=2\n", ")\n", "\n", "server_model = LLMModelLoader(\n", " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',\n", " model_name_or_path='gpt2',\n", " emulator_layer_num=18,\n", " adapter_top_layer_num=2,\n", " adapter_bottom_layer_num=2 \n", ")\n", "\n", "dataset = LLMDatasetLoader(\n", " module_name='qa_dataset', item_name='QaDataset',\n", " tokenizer_name_or_path='gpt2',\n", " select_num=100\n", ")\n", "\n", "data_collator = 
LLMDataFuncLoader(module_name='data_collator.cust_data_collator', item_name='get_seq2seq_data_collator', tokenizer_name_or_path='gpt2')\n", "\n", "batch_size = 1\n", "lr = 5e-5\n", "ds_config = {\n", " \"train_micro_batch_size_per_gpu\": batch_size,\n", " \"optimizer\": {\n", " \"type\": \"Adam\",\n", " \"params\": {\n", " \"lr\": lr,\n", " \"torch_adam\": True,\n", " \"adam_w_mode\": False\n", " }\n", " },\n", " \"fp16\": {\n", " \"enabled\": True\n", " },\n", " \"gradient_accumulation_steps\": 1,\n", " \"zero_optimization\": {\n", " \"stage\": 2,\n", " \"allgather_partitions\": True,\n", " \"allgather_bucket_size\": 1e8,\n", " \"overlap_comm\": True,\n", " \"reduce_scatter\": True,\n", " \"reduce_bucket_size\": 1e8,\n", " \"contiguous_gradients\": True,\n", " \"offload_optimizer\": {\n", " \"device\": \"cpu\"\n", " },\n", " \"offload_param\": {\n", " \"device\": \"cpu\"\n", " }\n", " }\n", "}\n", "\n", "train_args = Seq2SeqTrainingArguments(\n", " per_device_train_batch_size=1,\n", " learning_rate=5e-5,\n", " disable_tqdm=False,\n", " num_train_epochs=1,\n", " logging_steps=10,\n", " logging_strategy='steps',\n", " dataloader_num_workers=4,\n", " use_cpu=False,\n", " deepspeed=ds_config, # Add deepspeed config here\n", " remove_unused_columns=False,\n", " fp16=True\n", ")\n", "\n", "client_conf = get_conf_of_ot_runner(\n", " model=client_model,\n", " dataset=dataset,\n", " data_collator=data_collator,\n", " training_args=train_args,\n", " fed_args=FedAVGArguments(),\n", " aggregate_model=False,\n", ")\n", "\n", "server_conf = get_conf_of_ot_runner(\n", " model=server_model,\n", " dataset=dataset,\n", " data_collator=data_collator,\n", " training_args=train_args,\n", " fed_args=FedAVGArguments(),\n", " aggregate_model=False\n", ")\n", "\n", "\n", "homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"offsite_tuning_runner\",\n", " runner_class=\"OTRunner\"\n", ")\n", "\n", "homo_nn_0.guest.task_parameters(runner_conf=client_conf)\n", "homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)\n", "\n", "# if you have deployed eggroll, you can add this line to submit your job to eggroll\n", "homo_nn_0.guest.conf.set(\"launcher_name\", \"deepspeed\")\n", "\n", "pipeline.add_tasks([reader_0, homo_nn_0])\n", "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 4}))\n", "pipeline.compile()\n", "pipeline.fit()\n" ] }, { "cell_type": "markdown", "id": "97249681-c3a3-43bd-8167-7ae3f4e1616b", "metadata": {}, "source": [ "## Offsite-tuning + Multi Client Federation\n", "\n", "\n", "The Offsite-Tuning + FedAVG federation is configured based on the standard Offsite-Tuning. In this situation, you need to add data input & configurations for all clients. And do remember to add 'aggregate_model=True' for client & server conf so that model federation will be conducted during the training." 
] }, { "cell_type": "code", "execution_count": null, "id": "fdbdc60c-a948-4be3-bba6-519d8640b0a9", "metadata": {}, "outputs": [], "source": [ "import time\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline import FateFlowPipeline\n", "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner\n", "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMCustFuncLoader\n", "from peft import LoraConfig, TaskType\n", "\n", "\n", "guest = '10000'\n", "host = '10000'\n", "arbiter = '10000'\n", "\n", "pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)\n", "\n", "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest, host=host))\n", "reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"sciq\"\n", ")\n", "reader_0.hosts[0].task_parameters(\n", " namespace=\"experiment\",\n", " name=\"sciq\"\n", ")\n", "\n", "client_model = LLMModelLoader(\n", " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',\n", " model_name_or_path='gpt2',\n", " emulator_layer_num=4,\n", " adapter_top_layer_num=1,\n", " adapter_bottom_layer_num=1\n", ")\n", "\n", "server_model = LLMModelLoader(\n", " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',\n", " model_name_or_path='gpt2',\n", " emulator_layer_num=4,\n", " adapter_top_layer_num=1,\n", " adapter_bottom_layer_num=1 \n", ")\n", "\n", "dataset = LLMDatasetLoader(\n", " module_name='qa_dataset', item_name='QaDataset',\n", " tokenizer_name_or_path='gpt2',\n", " select_num=100\n", ")\n", "\n", "data_collator = LLMCustFuncLoader(module_name='cust_data_collator', item_name='get_seq2seq_tokenizer', model_path='gpt2')\n", "\n", "train_args = Seq2SeqTrainingArguments(\n", " per_device_train_batch_size=1,\n", " learning_rate=5e-5,\n", " disable_tqdm=False,\n", " num_train_epochs=1,\n", " logging_steps=10,\n", " logging_strategy='steps',\n", " dataloader_num_workers=4\n", ")\n", "\n", "client_conf = get_conf_of_ot_runner(\n", " model=client_model,\n", " dataset=dataset,\n", " data_collator=data_collator,\n", " training_args=train_args,\n", " fed_args=FedAVGArguments(),\n", " aggregate_model=True\n", ")\n", "\n", "server_conf = get_conf_of_ot_runner(\n", " model=server_model,\n", " dataset=dataset,\n", " data_collator=data_collator,\n", " training_args=train_args,\n", " fed_args=FedAVGArguments(),\n", " aggregate_model=True\n", ")\n", "\n", "homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"offsite_tuning_runner\",\n", " runner_class=\"OTRunner\"\n", ")\n", "\n", "homo_nn_0.guest.task_parameters(runner_conf=client_conf)\n", "homo_nn_0.hosts[0].task_parameters(runner_conf=client_conf)\n", "homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)\n", "\n", "pipeline.add_tasks([reader_0, homo_nn_0])\n", "\n", "pipeline.compile()\n", "pipeline.fit()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: 
doc/tutorial/offsite_tuning/README.md ================================================ # Offsite-Tuning ## Standard Offsite-tuning Offsite-Tuning is designed for the efficient adaptation of large foundational models to specific downstream tasks. Through Offsite-Tuning, the model owner can enhance the capabilities of large models with data providers' data without disclosing the full model weights or directly accessing the data providers' sensitive information. Specifically, the LLM owner sends a lightweight "Adapter" and a lossy-compressed "Emulator" to the data owner. Using these smaller components, the data owner can then fine-tune the model solely on their private data. The Adapter, once fine-tuned, is returned to the model owner and integrated back into the large model to enhance its performance on the specific dataset. In FATE-LLM 1.3, we provide these built-in models: - GPT2 series models (e.g., GPT2, GPT2-XL, etc.) - Bloom series models (such as Bloom7B) - Llama-1 series models (e.g., Llama7B) FATE-LLM v1.3 builds on v1.2 and offers the ability to easily configure multi-machine and multi-card acceleration. It also has specialized optimizations for the network transmission of adapters and emulators. [Read the full paper](https://arxiv.org/abs/2302.04870)
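For reference, the sketch below shows how the two components are typically declared in FATE-LLM, mirroring the GPT-2 tutorial notebook in this directory; the layer counts are illustrative and should be adjusted to your model.

```python
# Sketch only: loaders follow the GPT-2 offsite-tuning tutorial notebook in this directory.
from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader

# The data owner (client) trains a compact sub-model: bottom/top adapters plus a compressed emulator.
client_model = LLMModelLoader(
    module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',
    model_name_or_path='gpt2',
    emulator_layer_num=4,
    adapter_top_layer_num=1,
    adapter_bottom_layer_num=1
)

# The model owner (server) keeps the full main model and re-integrates the fine-tuned adapters.
server_model = LLMModelLoader(
    module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',
    model_name_or_path='gpt2',
    emulator_layer_num=4,
    adapter_top_layer_num=1,
    adapter_bottom_layer_num=1
)
```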

## Offsite-tuning with Federated Learning In addition to supporting standard two-party (model owner and data provider) offsite-tuning, FATE also supports offsite-tuning with multiple data providers simultaneously. Adapters can be fine-tuned locally and then aggregated with those from other data providers. Ultimately, large models can be enhanced through the secure aggregation of adapters from multiple parties. This approach can be used to address issues related to the uneven distribution of computational power and data. As shown in the diagram below:
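A minimal multi-client pipeline sketch, adapted from the Offsite-tuning tutorial notebook, is given below; party IDs, table bindings, and training arguments are placeholders. The key differences from the two-party job are the additional host party and `aggregate_model=True` in both the client and server runner configurations.

```python
# Sketch only: adapted from the Offsite-tuning tutorial notebook; adjust party IDs,
# table bindings, model paths and training arguments to your own deployment.
from fate_client.pipeline import FateFlowPipeline
from fate_client.pipeline.components.fate.reader import Reader
from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner
from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments
from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader

guest, host, arbiter = '9999', '10000', '10001'  # placeholder party IDs

pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)

reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest, host=host))
reader_0.guest.task_parameters(namespace="experiment", name="sciq")
reader_0.hosts[0].task_parameters(namespace="experiment", name="sciq")

client_model = LLMModelLoader(
    module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',
    model_name_or_path='gpt2', emulator_layer_num=4,
    adapter_top_layer_num=1, adapter_bottom_layer_num=1
)
server_model = LLMModelLoader(
    module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',
    model_name_or_path='gpt2', emulator_layer_num=4,
    adapter_top_layer_num=1, adapter_bottom_layer_num=1
)

dataset = LLMDatasetLoader(
    module_name='qa_dataset', item_name='QaDataset',
    tokenizer_name_or_path='gpt2', select_num=100
)
data_collator = LLMDataFuncLoader(
    module_name='data_collator.cust_data_collator',
    item_name='get_seq2seq_data_collator', tokenizer_name_or_path='gpt2'
)

train_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=1, learning_rate=5e-5, num_train_epochs=1
)

# aggregate_model=True enables adapter aggregation across the client parties
client_conf = get_conf_of_ot_runner(
    model=client_model, dataset=dataset, data_collator=data_collator,
    training_args=train_args, fed_args=FedAVGArguments(), aggregate_model=True
)
server_conf = get_conf_of_ot_runner(
    model=server_model, dataset=dataset, data_collator=data_collator,
    training_args=train_args, fed_args=FedAVGArguments(), aggregate_model=True
)

homo_nn_0 = HomoNN(
    'nn_0', train_data=reader_0.outputs["output_data"],
    runner_module="offsite_tuning_runner", runner_class="OTRunner"
)
homo_nn_0.guest.task_parameters(runner_conf=client_conf)
homo_nn_0.hosts[0].task_parameters(runner_conf=client_conf)
homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)

pipeline.add_tasks([reader_0, homo_nn_0])
pipeline.compile()
pipeline.fit()
```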
================================================ FILE: doc/tutorial/pellm/ChatGLM3-6B_ds.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Federated ChatGLM3 Tuning with Parameter Efficient methods in FATE-LLM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, we will demonstrate how to efficiently train federated ChatGLM3-6B with deepspeed using the FATE-LLM framework. In FATE-LLM, we introduce the \"pellm\" (Parameter Efficient Large Language Model) module, specifically designed for federated learning with large language models. We enable the implementation of parameter-efficient methods in federated learning, reducing communication overhead while maintaining model performance. In this tutorial we particularly focus on ChatGLM3-6B, and we will also emphasize the use of the Adapter mechanism for fine-tuning ChatGLM3-6B, which enables us to effectively reduce communication volume and improve overall efficiency.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## FATE-LLM: ChatGLM3-6B\n", "\n", "### ChatGLM3-6B\n", "ChatGLM3-6B is a large transformer-based language model with 5.977 billion parameters. It is an open bilingual language model based on General Language Model. You can download the pretrained model from [here](https://github.com/THUDM/ChatGLM3), or let the program automatically download it when you use it later.\n", "\n", "### Current Features\n", "\n", "In the current version, FATE-LLM: ChatGLM3-6B supports the following features:\n", "
\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment Setting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before running experiment, please make sure that [FATE-LLM Cluster](https://github.com/FederatedAI/FATE/wiki/Download#llm%E9%83%A8%E7%BD%B2%E5%8C%85) has been deployed. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset: Advertising Text Generation\n", "\n", "This is an advertising test generateion dataset, you can download dataset from the following links and place it in the examples/data folder. \n", "- [data link 1](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view)\n", "- [data link 2](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1) \n", "\n", "You can refer to following link for more details about [data](https://aclanthology.org/D19-1321.pdf)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "df = pd.read_json('${fate_install}/examples/data/AdvertiseGen/train.json', lines=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ChatGLM3-6B with Adapter\n", "\n", "In this section, we will guide you through the process of finetuning ChatGLM-6B with adapters using the FATE-LLM framework. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "ChatGLM model is located on fate_llm/model_zoo/chatglm.py, can be use directly" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "albert.py bloom.py distilbert.py parameter_efficient_llm.py\n", "bart.py chatglm.py gpt2.py\t qwen.py\n", "bert.py deberta.py llama.py roberta.py\n" ] } ], "source": [ "! ls ../../../../fate_llm/python/fate_llm/model_zoo/pellm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Adapters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can directly use adapters from the peft. See details for adapters on this page [Adapter Methods](https://huggingface.co/docs/peft/index) for more details. By specifying the adapter name and the adapter\n", "config dict we can insert adapters into our language models:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from peft import LoraConfig, TaskType\n", "\n", "lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", " target_modules=['query_key_value'],\n", ")\n", "lora_config.target_modules = list(lora_config.target_modules) # this line is needed to ensure lora_config is jsonable" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Init ChatGLM3 Model " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader\n", "\n", "pretrained_model_path = \"fill with pretrained model download path please\"\n", "\n", "model = LLMModelLoader(\n", " \"pellm.chatglm\",\n", " \"ChatGLM\",\n", " pretrained_path=pretrained_model_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " trust_remote_code=True\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**During the training process, all weights of the pretrained language model will be frozen, and weights of adapters are traininable. 
Thus, FATE-LLM only trains the adapters during local training and aggregates the adapters' weights in the federation process.**\n", "\n", "See [Adapters Overview](https://huggingface.co/docs/peft/index) for the adapters currently available.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Specify Dataset And DataCollator To Process Data" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from fate_client.pipeline.components.fate.nn.loader import LLMDatasetLoader, LLMDataFuncLoader\n", "\n", "tokenizer_params = dict(\n", " tokenizer_name_or_path=pretrained_model_path,\n", " trust_remote_code=True,\n", ")\n", "\n", "dataset = LLMDatasetLoader(\n", " \"prompt_dataset\",\n", " \"PromptDataset\",\n", " **tokenizer_params,\n", ")\n", "\n", "data_collator = LLMDataFuncLoader(\n", " \"data_collator.cust_data_collator\",\n", " \"get_seq2seq_data_collator\",\n", " **tokenizer_params,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Init DeepSpeed Config" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "ds_config = {\n", " \"train_micro_batch_size_per_gpu\": 1,\n", " \"optimizer\": {\n", " \"type\": \"Adam\",\n", " \"params\": {\n", " \"lr\": 5e-4\n", " }\n", " },\n", " \"fp16\": {\n", " \"enabled\": True\n", " },\n", " \"gradient_accumulation_steps\": 1,\n", " \"zero_optimization\": {\n", " \"stage\": 2,\n", " \"allgather_partitions\": True,\n", " \"allgather_bucket_size\": 1e8,\n", " \"overlap_comm\": True,\n", " \"reduce_scatter\": True,\n", " \"reduce_bucket_size\": 1e8,\n", " \"contiguous_gradients\": True,\n", " \"offload_optimizer\": {\n", " \"device\": \"cpu\"\n", " },\n", " \"offload_param\": {\n", " \"device\": \"cpu\"\n", " }\n", " }\n", "}\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit Federated Task\n", "To run the federated task, please make sure to use fate>=2.1.0 and deploy it with gpu machines. Before running this code, make sure the training data path is already bound. The following code should be copied to a script and run from a command line, e.g. \"python federated_chatglm.py\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can use this script to submit the task, but the job will take a long time to train and will generate a long log, so we won't run it here."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "from fate_client.pipeline.components.fate.reader import Reader\n", "from fate_client.pipeline import FateFlowPipeline\n", "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_seq2seq_runner\n", "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", "from peft import LoraConfig, TaskType\n", "\n", "\n", "guest = '10000'\n", "host = '10000'\n", "arbiter = '10000'\n", "\n", "epochs = 1\n", "batch_size = 1\n", "lr = 5e-4\n", "\n", "ds_config = {\n", " \"train_micro_batch_size_per_gpu\": batch_size,\n", " \"optimizer\": {\n", " \"type\": \"Adam\",\n", " \"params\": {\n", " \"lr\": lr,\n", " \"torch_adam\": True,\n", " \"adam_w_mode\": False\n", " }\n", " },\n", " \"fp16\": {\n", " \"enabled\": True\n", " },\n", " \"gradient_accumulation_steps\": 1,\n", " \"zero_optimization\": {\n", " \"stage\": 2,\n", " \"allgather_partitions\": True,\n", " \"allgather_bucket_size\": 1e8,\n", " \"overlap_comm\": True,\n", " \"reduce_scatter\": True,\n", " \"reduce_bucket_size\": 1e8,\n", " \"contiguous_gradients\": True,\n", " \"offload_optimizer\": {\n", " \"device\": \"cpu\"\n", " },\n", " \"offload_param\": {\n", " \"device\": \"cpu\"\n", " }\n", " }\n", "}\n", "\n", "pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)\n", "# pipeline.bind_local_path(path=\"\", namespace=\"experiment\", name=\"ad\")\n", "time.sleep(5)\n", "\n", "\n", "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest, host=host))\n", "reader_0.guest.task_parameters(\n", " namespace=\"experiment\",\n", " name=\"ad\"\n", ")\n", "reader_0.hosts[0].task_parameters(\n", " namespace=\"experiment\",\n", " name=\"ad\"\n", ")\n", "\n", "# define lora config\n", "lora_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", " target_modules=['query_key_value'],\n", ")\n", "lora_config.target_modules = list(lora_config.target_modules)\n", "\n", "pretrained_model_path = \"/data/cephfs/llm/models/chatglm3-6b\"\n", "\n", "model = LLMModelLoader(\n", " \"pellm.chatglm\",\n", " \"ChatGLM\",\n", " pretrained_path=pretrained_model_path,\n", " peft_type=\"LoraConfig\",\n", " peft_config=lora_config.to_dict(),\n", " trust_remote_code=True\n", ")\n", "\n", "\n", "tokenizer_params = dict(\n", " tokenizer_name_or_path=pretrained_model_path,\n", " trust_remote_code=True,\n", ")\n", "\n", "dataset = LLMDatasetLoader(\n", " \"prompt_dataset\",\n", " \"PromptDataset\",\n", " **tokenizer_params,\n", ")\n", "\n", "data_collator = LLMDataFuncLoader(\n", " \"data_collator.cust_data_collator\",\n", " \"get_seq2seq_data_collator\",\n", " **tokenizer_params,\n", ")\n", "\n", "conf = get_config_of_seq2seq_runner(\n", " algo='fedavg',\n", " model=model,\n", " dataset=dataset,\n", " data_collator=data_collator,\n", " training_args=Seq2SeqTrainingArguments(\n", " num_train_epochs=epochs,\n", " per_device_train_batch_size=batch_size,\n", " remove_unused_columns=False, \n", " predict_with_generate=False,\n", " deepspeed=ds_config,\n", " learning_rate=lr,\n", " use_cpu=False, # this must be set as we will gpu\n", " fp16=True,\n", " ),\n", " fed_args=FedAVGArguments(),\n", " task_type='causal_lm',\n", " save_trainable_weights_only=True # only save trainable 
weights\n", ")\n", "\n", "homo_nn_0 = HomoNN(\n", " 'nn_0',\n", " runner_conf=conf,\n", " train_data=reader_0.outputs[\"output_data\"],\n", " runner_module=\"homo_seq2seq_runner\",\n", " runner_class=\"Seq2SeqRunner\",\n", ")\n", "\n", "homo_nn_0.guest.conf.set(\"launcher_name\", \"deepspeed\") # tell schedule engine to run task with deepspeed\n", "homo_nn_0.hosts[0].conf.set(\"launcher_name\", \"deepspeed\") # tell schedule engine to run task with deepspeed\n", "\n", "pipeline.add_tasks([reader_0, homo_nn_0])\n", "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 1})) # the number of gpus of each party\n", "\n", "pipeline.compile()\n", "pipeline.fit()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Training With P-Tuning V2 Adapter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To use another adapter lke P-Tuning V2, slightly changes is needed!" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "model = LLMModelLoader(\n", " \"pellm.chatglm\",\n", " \"ChatGLM\",\n", " pretrained_path=pretrained_model_path,\n", " pre_seq_len=128,\n", " trust_remote_code=True\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inference" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Models trained with FATE-LLM can be find under the directory `${fate_install}/fateflow/model/$job_id/${role}/${party_id}/$cpn_name/0/output/output_model/model_directory/adapter_model.bin}`,\n", "The following code is an example to load trained lora adapter weights:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import sys\n", "import torch\n", "from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model\n", "from transformers import AutoModel, AutoTokenizer\n", "\n", "\n", "def load_model(pretrained_model_path):\n", " _tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path, trust_remote_code=True)\n", " _model = AutoModel.from_pretrained(pretrained_model_path, trust_remote_code=True)\n", "\n", " _model = _model.half()\n", " _model = _model.eval()\n", "\n", " return _model, _tokenizer\n", "\n", "\n", "def load_data(data_path):\n", " with open(data_path, \"r\") as fin:\n", " for _l in fin:\n", " yield json.loads(_l.strip())\n", "\n", "\n", "chatglm_model_path = \"\"\n", "model, tokenizer = load_model(chatglm_model_path)\n", "\n", "test_data_path = \"{fate_install}/examples/data/AdvertiseGen/dev.json\"\n", "dataset = load_data(test_data_path)\n", "\n", "peft_path = \"${fate_install}/fateflow/model/$job_id/${role}/${party_id}/$cpn_name/0/output/output_model/model_directory/adapter_model.bin}\"\n", "\n", "model = PeftModel.from_pretrained(model, peft_path)\n", "model = model.half()\n", "model.eval()\n", "\n", "for p in model.parameters():\n", " if p.requires_grad:\n", " print(p)\n", "\n", "model.cuda(\"cuda:0\")\n", "\n", "content = list(dataset)[0][\"content\"]\n", "print(model.chat(tokenizer, content, do_sample=False))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: doc/tutorial/pellm/builtin_pellm_models.md 
================================================ ## Builtin PELLM Models FATE-LLM provide some builtin pellm models, users can use them simply to efficiently train their language models. To use these models, please read the using tutorial of [ChatGLM-6B Training Guide](./ChatGLM3-6B_ds.ipynb). After reading the training tutorial above, it's easy to use other models listing in the following tabular by changing `module_name`, `class_name`, `dataset` list below. | Model | ModuleName | ClassName | DataSetName | | -------------- | ----------------- | --------------| --------------- | | Qwen2 | pellm.qwen | Qwen | prompt_dataset | | Bloom-7B1 | pellm.bloom | Bloom | prompt_dataset | | OPT-6.7B | pellm.opt | OPT | prompt_dataset | | LLaMA-2-7B | pellm.llama | LLaMa | prompt_dataset | | LLaMA-7B | pellm.llama | LLaMa | prompt_dataset | | ChatGLM3-6B | pellm.chatglm | ChatGLM | prompt_dataset | | GPT-2 | pellm.gpt2 | GPT2CLM | prompt_dataset | | GPT-2 | pellm.gpt2 | GPT2 | seq_cls_dataset | | ALBERT | pellm.albert | Albert | seq_cls_dataset | | BART | pellm.bart | Bart | seq_cls_dataset | | BERT | pellm.bert | Bert | seq_cls_dataset | | DeBERTa | pellm.deberta | Deberta | seq_cls_dataset | | DistilBERT | pellm.distilbert | DistilBert | seq_cls_dataset | | RoBERTa | pellm.roberta | Roberta | seq_cls_dataset | ================================================ FILE: examples/fedmkt/__init__.py ================================================ ================================================ FILE: examples/fedmkt/fedmkt.py ================================================ from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_fedmkt_runner from fate_client.pipeline.components.fate.nn.algo_params import FedMKTTrainingArguments, FedAVGArguments from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader from peft import LoraConfig, TaskType from fate_client.pipeline import FateFlowPipeline from fate_client.pipeline.components.fate.reader import Reader from transformers import AutoConfig import argparse import yaml from typing import Union, Dict def main(config="./config.yaml", param: Union[Dict, str] = None): if isinstance(config, str): with open(config, 'r') as f: config = yaml.safe_load(f) if isinstance(param, str): param = yaml.safe_load(param) guest = config['parties']['guest'][0] # replace with actual guest party ID host = config['parties']['host'][0] # replace with actual host party ID arbiter = config['parties']['arbiter'][0] # replace with actual arbiter party ID process_data_output_dir = config['paths']['process_data_output_dir'] llm_pretrained_path = config['paths']['llm_pretrained_path'] slm_pretrained_paths = config['paths']['slm_pretrained_paths'] vocab_mapping_directory = config['paths']['vocab_mapping_directory'] slm_to_llm_vocab_mapping_paths = [ vocab_mapping_directory + "/" + path for path in config['paths']['slm_to_llm_vocab_mapping_paths'] ] llm_to_slm_vocab_mapping_paths = [ vocab_mapping_directory + "/" + path for path in config['paths']['llm_to_slm_vocab_mapping_paths'] ] slm_models = config['models']['slm_models'] slm_lora_target_modules = config['lora_config']['slm_lora_target_modules'] def get_llm_conf(): lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, r=param['lora_config']['llm']['r'], lora_alpha=param['lora_config']['llm']['lora_alpha'], lora_dropout=param['lora_config']['llm']['lora_dropout'], target_modules=param['lora_config']['llm']['target_modules'] ) 
lora_config.target_modules = list(lora_config.target_modules) llm_model = LLMModelLoader( "pellm.llama", "LLaMa", pretrained_path=llm_pretrained_path, peft_type="LoraConfig", peft_config=lora_config.to_dict(), torch_dtype="bfloat16" ) pub_dataset = LLMDatasetLoader( "qa_dataset", "QaDataset", tokenizer_name_or_path=llm_pretrained_path, need_preprocess=True, dataset_name="arc_challenge", data_part="common", seq_max_len=512 ) training_args = FedMKTTrainingArguments( global_epochs=param['training']['llm']['global_epochs'], per_device_train_batch_size=param['training']['llm']['per_device_train_batch_size'], gradient_accumulation_steps=param['training']['llm']['gradient_accumulation_steps'], learning_rate=param['training']['llm']['learning_rate'], output_dir=param['training']['llm']['output_dir'], dataloader_num_workers=param['training']['llm']['dataloader_num_workers'], remove_unused_columns=param['training']['llm']['remove_unused_columns'], warmup_ratio=param['training']['llm']['warmup_ratio'], lr_scheduler_type=param['training']['llm']['lr_scheduler_type'], optim=param['training']['llm']['optim'], adam_beta1=param['training']['llm']['adam_beta1'], adam_beta2=param['training']['llm']['adam_beta2'], weight_decay=param['training']['llm']['weight_decay'], max_grad_norm=param['training']['llm']['max_grad_norm'], use_cpu=param['training']['llm']['use_cpu'], vocab_size=AutoConfig.from_pretrained(llm_pretrained_path).vocab_size, ) fed_args = FedAVGArguments( aggregate_strategy='epoch', aggregate_freq=1 ) tokenizer = LLMDataFuncLoader( "tokenizers.cust_tokenizer", "get_tokenizer", tokenizer_name_or_path=llm_pretrained_path ) slm_tokenizers = [ LLMDataFuncLoader("tokenizers.cust_tokenizer", "get_tokenizer", tokenizer_name_or_path=path) for path in slm_pretrained_paths ] return get_config_of_fedmkt_runner( model=llm_model, training_args=training_args, fed_args=fed_args, pub_dataset=pub_dataset, tokenizer=tokenizer, slm_tokenizers=slm_tokenizers, slm_to_llm_vocab_mapping_paths=slm_to_llm_vocab_mapping_paths, pub_dataset_path=process_data_output_dir, save_trainable_weights_only=True, ) def get_slm_conf(slm_idx): slm_pretrained_path = slm_pretrained_paths[slm_idx] lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, r=param['lora_config']['slm'][slm_idx]['r'], lora_alpha=param['lora_config']['slm'][slm_idx]['lora_alpha'], lora_dropout=param['lora_config']['slm'][slm_idx]['lora_dropout'], target_modules=param['lora_config']['slm'][slm_idx]['target_modules'] ) lora_config.target_modules = list(lora_config.target_modules) llm_to_slm_vocab_mapping = llm_to_slm_vocab_mapping_paths[slm_idx] slm_model = LLMModelLoader( slm_models[slm_idx][0], slm_models[slm_idx][1], pretrained_path=slm_pretrained_path, peft_type="LoraConfig", peft_config=lora_config.to_dict(), ) vocab_size = AutoConfig.from_pretrained(slm_pretrained_path).vocab_size pub_dataset = LLMDatasetLoader( "qa_dataset", "QaDataset", tokenizer_name_or_path=slm_pretrained_path, need_preprocess=True, dataset_name="arc_challenge", data_part="common", seq_max_len=512 ) priv_dataset = LLMDatasetLoader( "qa_dataset", "QaDataset", tokenizer_name_or_path=slm_pretrained_path, need_preprocess=True, dataset_name="arc_challenge", data_part="client_0", seq_max_len=512 ) training_args = FedMKTTrainingArguments( global_epochs=param['training']['slm']['global_epochs'], per_device_train_batch_size=param['training']['slm']['per_device_train_batch_size'], gradient_accumulation_steps=param['training']['slm']['gradient_accumulation_steps'], 
learning_rate=param['training']['slm']['learning_rate'] if slm_idx != 1 else 3e-4, output_dir=param['training']['slm']['output_dir'], dataloader_num_workers=param['training']['slm']['dataloader_num_workers'], remove_unused_columns=param['training']['slm']['remove_unused_columns'], warmup_ratio=param['training']['slm']['warmup_ratio'], lr_scheduler_type=param['training']['slm']['lr_scheduler_type'], optim=param['training']['slm']['optim'], adam_beta1=param['training']['slm']['adam_beta1'], adam_beta2=param['training']['slm']['adam_beta2'], weight_decay=param['training']['slm']['weight_decay'], max_grad_norm=param['training']['slm']['max_grad_norm'], use_cpu=param['training']['slm']['use_cpu'], vocab_size=vocab_size, ) fed_args = FedAVGArguments( aggregate_strategy='epoch', aggregate_freq=1 ) tokenizer = LLMDataFuncLoader( "tokenizers.cust_tokenizer", "get_tokenizer", tokenizer_name_or_path=slm_pretrained_path ) llm_tokenizer = LLMDataFuncLoader( "tokenizers.cust_tokenizer", "get_tokenizer", tokenizer_name_or_path=llm_pretrained_path ) data_collator = LLMDataFuncLoader( module_name='data_collator.cust_data_collator', item_name='get_seq2seq_data_collator', tokenizer_name_or_path=slm_pretrained_path ) return get_config_of_fedmkt_runner( model=slm_model, training_args=training_args, fed_args=fed_args, pub_dataset=pub_dataset, priv_dataset=priv_dataset, tokenizer=tokenizer, llm_tokenizer=llm_tokenizer, llm_to_slm_vocab_mapping_path=llm_to_slm_vocab_mapping, pub_dataset_path=process_data_output_dir, save_trainable_weights_only=True, data_collator=data_collator ) pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter, host=host) pipeline.bind_local_path(path=process_data_output_dir, namespace="experiment", name="arc_challenge") reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest, host=host)) reader_0.guest.task_parameters( namespace=config['data']['guest']['namespace'], name=config['data']['guest']['name'] ) reader_0.hosts[[0, 1, 2]].task_parameters( namespace=config['data']['host']['namespace'], name=config['data']['host']['name'] ) homo_nn_0 = HomoNN( 'nn_0', train_data=reader_0.outputs["output_data"], runner_module="fedmkt_runner", runner_class="FedMKTRunner", ) homo_nn_0.arbiter.task_parameters( runner_conf=get_llm_conf() ) homo_nn_0.guest.task_parameters( runner_conf=get_slm_conf(slm_idx=0) ) for idx in range(1): homo_nn_0.hosts[idx].task_parameters( runner_conf=get_slm_conf(slm_idx=idx + 1) ) homo_nn_0.guest.conf.set("launcher_name", "deepspeed") # tell scheduler engine to run task with deepspeed homo_nn_0.hosts[0].conf.set("launcher_name", "deepspeed") # tell scheduler engine to run task with deepspeed homo_nn_0.arbiter.conf.set("launcher_name", "deepspeed") # tell scheduler engine to run task with deepspeed pipeline.add_tasks([reader_0, homo_nn_0]) pipeline.conf.set("task", dict(engine_run={"cores": 1})) # the number of gpus of each party pipeline.compile() pipeline.fit() if __name__ == "__main__": parser = argparse.ArgumentParser("LLMSUITE PIPELINE JOB") parser.add_argument("-c", "--config", type=str, help="config file", default="./config.yaml") parser.add_argument("-p", "--param", type=str, help="config file for params", default="./fedmkt_config.yaml") args = parser.parse_args() main(args.config, args.param) ================================================ FILE: examples/fedmkt/fedmkt_config.yaml ================================================ # fedmkt_config.yaml # Configuration for Lora lora_config: llm: r: 8 lora_alpha: 16 lora_dropout: 0.05 target_modules: 
- q_proj - k_proj - v_proj - o_proj slm: - # Configuration for the first SLM model r: 8 lora_alpha: 32 lora_dropout: 0.1 target_modules: - q_proj - v_proj - # Configuration for the second SLM model r: 8 lora_alpha: 32 lora_dropout: 0.1 target_modules: - c_attn # Training configuration training: llm: global_epochs: 5 per_device_train_batch_size: 1 gradient_accumulation_steps: 4 learning_rate: 3e-5 output_dir: "./" dataloader_num_workers: 4 remove_unused_columns: false warmup_ratio: 0.008 lr_scheduler_type: "cosine" optim: "adamw_torch" adam_beta1: 0.9 adam_beta2: 0.95 weight_decay: 0.1 max_grad_norm: 1.0 use_cpu: false slm: global_epochs: 5 per_device_train_batch_size: 1 gradient_accumulation_steps: 4 learning_rate: 3e-5 # Adjust learning rate for SLM models output_dir: "./" dataloader_num_workers: 4 remove_unused_columns: false warmup_ratio: 0.008 lr_scheduler_type: "cosine" optim: "adamw_torch" adam_beta1: 0.9 adam_beta2: 0.95 weight_decay: 0.1 max_grad_norm: 1.0 use_cpu: false # Paths configuration paths: process_data_output_dir: "" llm_pretrained_path: "Llama-2-7b-hf" slm_pretrained_paths: - "opt-1.3b" - "gpt2" vocab_mapping_directory: "" slm_to_llm_vocab_mapping_paths: - "opt_to_llama.json" - "gpt2_to_llama.json" - "llama_small_to_llama.json" llm_to_slm_vocab_mapping_paths: - "llama_to_opt.json" - "llama_to_gpt2.json" - "llama_to_llama_small" # Models configuration models: slm_models: - ["pellm.opt", "OPT"] - ["pellm.gpt2", "GPT2CLM"] # Data configuration data: guest: namespace: "experiment" name: "arc_challenge" host: namespace: "experiment" name: "arc_challenge" # Example: Additional custom configuration custom_config: some_param: "value" another_param: 123 ================================================ FILE: examples/fedmkt/test_fedmkt_llmsuit.yaml ================================================ data: - file: table_name: arc_challenge namespace: experiment role: guest_0 - file: table_name: arc_challenge namespace: experiment role: host_0 bloom_lora_vs_zero_shot: gpt2_fedmkt: pretrained: "gpt2" script: "./fedmkt.py" conf: "./fedmkt_config.yaml" ================================================ FILE: examples/offsite_tuning/__init__.py ================================================ ================================================ FILE: examples/offsite_tuning/offsite_tuning.py ================================================ import argparse import yaml from fate_client.pipeline.components.fate.reader import Reader from fate_client.pipeline import FateFlowPipeline from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader from fate_client.pipeline.components.fate.nn.torch.base import Sequential from fate_client.pipeline.components.fate.nn.torch import nn def load_params(file_path): """Load and parse the YAML params file.""" with open(file_path, 'r') as f: params = yaml.safe_load(f) return params def setup_pipeline(params): """Set up the pipeline using the provided parameters.""" guest = params['pipeline']['guest'] arbiter = params['pipeline']['arbiter'] pretrained_model_path = params['paths']['pretrained_model_path'] pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter) reader = Reader("reader_0", runtime_parties=dict(guest=guest)) reader.guest.task_parameters( namespace=params['pipeline']['namespace'], 
name=params['pipeline']['name'] ) client_model = LLMModelLoader( module_name=params['models']['client']['module_name'], item_name=params['models']['client']['item_name'], model_name_or_path=pretrained_model_path, emulator_layer_num=params['models']['client']['emulator_layer_num'], adapter_top_layer_num=params['models']['client']['adapter_top_layer_num'], adapter_bottom_layer_num=params['models']['client']['adapter_bottom_layer_num'] ) server_model = LLMModelLoader( module_name=params['models']['server']['module_name'], item_name=params['models']['server']['item_name'], model_name_or_path=pretrained_model_path, emulator_layer_num=params['models']['server']['emulator_layer_num'], adapter_top_layer_num=params['models']['server']['adapter_top_layer_num'], adapter_bottom_layer_num=params['models']['server']['adapter_bottom_layer_num'] ) dataset = LLMDatasetLoader( module_name=params['dataset']['module_name'], item_name=params['dataset']['item_name'], tokenizer_name_or_path=params['dataset']['tokenizer_name_or_path'], select_num=params['dataset']['select_num'] ) data_collator = LLMDataFuncLoader( module_name=params['data_collator']['module_name'], item_name=params['data_collator']['item_name'], tokenizer_name_or_path=params['data_collator']['tokenizer_name_or_path'] ) train_args = Seq2SeqTrainingArguments( per_device_train_batch_size=params['training']['batch_size'], learning_rate=params['training']['learning_rate'], disable_tqdm=False, num_train_epochs=params['training']['num_train_epochs'], logging_steps=params['training']['logging_steps'], logging_strategy='steps', dataloader_num_workers=4, use_cpu=False, deepspeed=params['training']['deepspeed'], # Add DeepSpeed config here remove_unused_columns=False, fp16=True ) client_conf = get_conf_of_ot_runner( model=client_model, dataset=dataset, data_collator=data_collator, training_args=train_args, fed_args=FedAVGArguments(), aggregate_model=False, ) server_conf = get_conf_of_ot_runner( model=server_model, dataset=dataset, data_collator=data_collator, training_args=train_args, fed_args=FedAVGArguments(), aggregate_model=False ) homo_nn = HomoNN( 'nn_0', train_data=reader.outputs["output_data"], runner_module="offsite_tuning_runner", runner_class="OTRunner" ) homo_nn.guest.task_parameters(runner_conf=client_conf) homo_nn.arbiter.task_parameters(runner_conf=server_conf) # If using Eggroll, you can add this line to submit your job homo_nn.guest.conf.set("launcher_name", "deepspeed") pipeline.add_tasks([reader, homo_nn]) pipeline.conf.set("task", dict(engine_run=params['pipeline']['engine_run'])) pipeline.compile() pipeline.fit() def main(config_file, param_file): params = load_params(param_file) setup_pipeline(params) if __name__ == "__main__": parser = argparse.ArgumentParser("LLMSUITE Offsite-tuning JOB") parser.add_argument("-c", "--config", type=str, help="Path to config file", default="./config.yaml") parser.add_argument("-p", "--param", type=str, help="Path to parameter file", default="./test_offsite_tuning_llmsuite.yaml") args = parser.parse_args() main(args.config, args.param) ================================================ FILE: examples/offsite_tuning/offsite_tuning_config.yaml ================================================ # params.yaml paths: pretrained_model_path: 'gpt2' pipeline: guest: '9999' arbiter: '9999' namespace: 'experiment' name: 'sciq' engine_run: cores: 1 training: batch_size: 1 learning_rate: 5e-5 num_train_epochs: 1 logging_steps: 10 deepspeed: train_micro_batch_size_per_gpu: 1 optimizer: type: "Adam" params: lr: 5e-5 
torch_adam: true adam_w_mode: false fp16: enabled: true gradient_accumulation_steps: 1 zero_optimization: stage: 2 allgather_partitions: true allgather_bucket_size: 1e8 overlap_comm: true reduce_scatter: true reduce_bucket_size: 1e8 contiguous_gradients: true offload_optimizer: device: "cpu" offload_param: device: "cpu" models: client: module_name: 'offsite_tuning.gpt2' item_name: 'GPT2LMHeadSubModel' emulator_layer_num: 11 adapter_top_layer_num: 2 adapter_bottom_layer_num: 2 server: module_name: 'offsite_tuning.gpt2' item_name: 'GPT2LMHeadMainModel' emulator_layer_num: 11 adapter_top_layer_num: 2 adapter_bottom_layer_num: 2 dataset: module_name: 'qa_dataset' item_name: 'QaDataset' tokenizer_name_or_path: 'gpt2' select_num: 100 data_collator: module_name: 'data_collator.cust_data_collator' item_name: 'get_seq2seq_data_collator' tokenizer_name_or_path: 'gpt2' ================================================ FILE: examples/offsite_tuning/test_offsite_tuning_llmsuite.yaml ================================================ data: - file: table_name: sciq namespace: experiment role: guest_0 - file: table_name: sciq namespace: experiment role: host_0 bloom_lora_vs_zero_shot: gpt2_ot: pretrained: "gpt2" script: "./offsite_tuning.py" conf: "./offsite_tuning_config.yaml" ================================================ FILE: examples/pellm/__init__.py ================================================ ================================================ FILE: examples/pellm/bloom_lora_config.yaml ================================================ data: guest: namespace: experiment name: ad host: namespace: experiment name: ad epoch: 1 batch_size: 4 lr: 5e-4 pretrained_model_path: bloom-560m peft_config: alpha_pattern: {} auto_mapping: null base_model_name_or_path: null bias: none fan_in_fan_out: false inference_mode: false init_lora_weights: true layers_pattern: null layers_to_transform: null loftq_config: { } lora_alpha: 32 lora_dropout: 0.1 megatron_config: null megatron_core: megatron.core modules_to_save: null peft_type: LORA r: 8 rank_pattern: { } revision: null target_modules: - query_key_value task_type: CAUSAL_LM use_rslora: false ds_config: fp16: enabled: true gradient_accumulation_steps: 1 optimizer: params: adam_w_mode: false lr: 5e-4 torch_adam: true type: Adam train_micro_batch_size_per_gpu: 4 zero_optimization: allgather_bucket_size: 100000000.0 allgather_partitions: true contiguous_gradients: true offload_optimizer: device: cpu offload_param: device: cpu overlap_comm: true reduce_bucket_size: 100000000.0 reduce_scatter: true stage: 2 ================================================ FILE: examples/pellm/test_bloom_lora.py ================================================ import time from fate_client.pipeline.components.fate.reader import Reader from fate_client.pipeline import FateFlowPipeline from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_seq2seq_runner from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader from peft import LoraConfig, TaskType from fate_client.pipeline.utils import test_utils import argparse import yaml from typing import Union, Dict def main(config="../../config.yaml", param: Union[Dict, str] = None, namespace=""): if isinstance(config, str): config = test_utils.load_job_config(config) if isinstance(param, str): param = yaml.safe_load(param) parties = config.parties guest = 
parties.guest[0] host = parties.host[0] arbiter = parties.arbiter[0] pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter) reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest, host=host)) reader_0.guest.task_parameters( namespace=param["data"]["guest"]["namespace"], name=param["data"]["guest"]["name"] ) reader_0.hosts[0].task_parameters( namespace=param["data"]["host"]["namespace"], name=param["data"]["host"]["name"] ) lora_config = LoraConfig(**param["peft_config"]) lora_config.target_modules = list(lora_config.target_modules) pretrained_model_path = param["pretrained_model_path"] model = LLMModelLoader( "pellm.bloom", "Bloom", pretrained_path=pretrained_model_path, peft_type="LoraConfig", peft_config=lora_config.to_dict(), trust_remote_code=True ) tokenizer_params = dict( tokenizer_name_or_path=pretrained_model_path, trust_remote_code=True, ) dataset = LLMDatasetLoader( "prompt_dataset", "PromptDataset", **tokenizer_params, ) data_collator = LLMDataFuncLoader( "data_collator.cust_data_collator", "get_seq2seq_data_collator", **tokenizer_params, ) conf = get_config_of_seq2seq_runner( algo='fedavg', model=model, dataset=dataset, data_collator=data_collator, training_args=Seq2SeqTrainingArguments( num_train_epochs=param["epoch"], per_device_train_batch_size=param["batch_size"], remove_unused_columns=False, predict_with_generate=False, deepspeed=param["ds_config"], learning_rate=param["lr"], use_cpu=False, # this must be set as we will gpu fp16=True, ), fed_args=FedAVGArguments(), task_type='causal_lm', save_trainable_weights_only=True # only save trainable weights ) homo_nn_0 = HomoNN( 'nn_0', runner_conf=conf, train_data=reader_0.outputs["output_data"], runner_module="homo_seq2seq_runner", runner_class="Seq2SeqRunner", ) homo_nn_0.guest.conf.set("launcher_name", "deepspeed") # tell schedule engine to run task with deepspeed homo_nn_0.hosts[0].conf.set("launcher_name", "deepspeed") # tell schedule engine to run task with deepspeed pipeline.add_tasks([reader_0, homo_nn_0]) pipeline.conf.set("task", dict(engine_run={"cores": 1})) # the number of gpus of each party pipeline.compile() pipeline.fit() return pretrained_model_path if __name__ == "__main__": parser = argparse.ArgumentParser("LLMSUITE PIPELINE JOB") parser.add_argument("-c", "--config", type=str, help="config file", default="../../config.yaml") parser.add_argument("-p", "--param", type=str, help="config file for params", default="./bloom_lora_config.yaml") args = parser.parse_args() main(args.config, args.param) ================================================ FILE: examples/pellm/test_pellm_llmsuite.yaml ================================================ data: - file: examples/data/AdvertiseGen/train.json table_name: ad namespace: experiment role: guest_0 - file: examples/data/AdvertiseGen/train.json table_name: ad namespace: experiment role: host_0 bloom_lora_vs_zero_shot: bloom_lora: pretrained: "bloom-560m" script: "./test_bloom_lora.py" conf: "./bloom_lora_config.yaml" peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" tasks: - "advertise-gen" bloom_zero_shot: pretrained: "bloom-560m" tasks: - "advertise-gen" ================================================ FILE: python/MANIFEST.in ================================================ include fate_llm/dataset/data_config/*yaml include python/fate_llm/evaluate/tasks/*/*yaml ================================================ FILE: python/fate_llm/__init__.py 
================================================ ================================================ FILE: python/fate_llm/algo/__init__.py ================================================ ================================================ FILE: python/fate_llm/algo/dp/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from .opacus_compatibility.transformers_compate import get_model_class from .dp_trainer import DPTrainer, DPTrainingArguments ================================================ FILE: python/fate_llm/algo/dp/dp_trainer.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import logging import opacus import os import torch from dataclasses import dataclass, field from transformers.training_args_seq2seq import Seq2SeqTrainingArguments from torch.utils.data import DataLoader from tqdm import tqdm from typing import Optional, Callable from .opacus_compatibility import add_layer_compatibility, add_optimizer_compatibility from .opacus_compatibility.transformers_compate import prepare_position_ids logger = logging.getLogger(__name__) @dataclass class DPTrainingArguments(Seq2SeqTrainingArguments): target_epsilon: float = field(default=3) target_delta: float = field(default=1e-5) freeze_embedding: bool = field(default=True) device_id: int = field(default=0) class DPTrainer(object): def __init__( self, model: torch.nn.Module, training_args: DPTrainingArguments, train_set, loss_fn, optimizer: torch.optim.Optimizer = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, data_collator: Callable = None, use_tqdm: bool = False, ): self.module = model self.training_args = training_args self.ori_optimizer = optimizer self.lr_scheduler = scheduler self.train_set = train_set self.data_collator = data_collator self.loss_fn = loss_fn self.use_tqdm = use_tqdm self.data_loader = DataLoader( dataset=self.train_set, shuffle=True, batch_size=self.training_args.per_device_train_batch_size, collate_fn=self.data_collator ) if not self.training_args.use_cpu: self.module.cuda(self.training_args.device_id) if self.training_args.freeze_embedding: self.freeze_model_embedding() self.dp_model = None self.dp_optimizer = None self.privacy_engine = None self._init_dp_model() def _init_dp_model(self): self.module.train() # add compatibility for layer hooks add_layer_compatibility(opacus) self.privacy_engine = opacus.PrivacyEngine(accountant="rdp") self.dp_model, self.dp_optimizer, _ = self.privacy_engine.make_private_with_epsilon( module=self.module, optimizer=self.ori_optimizer, data_loader=self.data_loader, target_delta=self.training_args.target_delta, target_epsilon=self.training_args.target_epsilon, max_grad_norm=self.training_args.max_grad_norm, epochs=int(self.training_args.num_train_epochs), ) add_optimizer_compatibility(self.dp_optimizer) def train(self): logger.info(f"begin dp training, total epochs={self.training_args.num_train_epochs}") for epoch in range(int(self.training_args.num_train_epochs)): logger.info(f"dp training on epoch={epoch}") self._train_an_epoch() def _train_an_epoch(self): if self.use_tqdm: data_loader = tqdm(self.data_loader) else: data_loader = self.data_loader for batch_idx, batch_data in enumerate(tqdm(data_loader)): input_ids = batch_data["input_ids"] labels = batch_data["labels"] if "attention_mask" not in batch_data: attention_mask = torch.ones(input_ids.shape) else: attention_mask = batch_data["attention_mask"] if not self.training_args.use_cpu: input_ids = input_ids.to(self.module.device) labels = labels.to(self.module.device) attention_mask = attention_mask.to(self.module.device) inputs = self._prepare_batch_input(input_ids) logits = self.dp_model(**inputs).logits loss = self.loss_fn(logits, labels, attention_mask) loss = loss.mean() loss.backward() if (batch_idx + 1) % self.training_args.gradient_accumulation_steps == 0 or \ batch_idx + 1 == len(self.data_loader): self.dp_optimizer.step() if self.lr_scheduler is not None: self.lr_scheduler.step() self.dp_optimizer.zero_grad() else: self.dp_optimizer.step() self.dp_optimizer.zero_grad() def _prepare_batch_input(self, input_ids) -> dict: position_ids = prepare_position_ids(self.module, 
input_ids) if not self.training_args.use_cpu: position_ids = position_ids.to(self.module.device) return dict(input_ids=input_ids, position_ids=position_ids) def freeze_model_embedding(self): self.module.get_input_embeddings().requires_grad_(False) def save_model( self, output_dir="./" ): if hasattr(self.module, "save_pretrained"): self.module.save_pretrained(output_dir) else: if not os.path.exists(output_dir): os.makedirs(output_dir) torch.save(self.module.state_dict(), output_dir + '/pytorch_model.bin') ================================================ FILE: python/fate_llm/algo/dp/opacus_compatibility/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from .grad_sample.embedding import compute_embedding_grad_sample from .optimizers.optimizer import add_noise_wrapper def add_layer_compatibility(opacus): replace_method = [] for k, v in opacus.GradSampleModule.GRAD_SAMPLERS.items(): if v.__name__ == "compute_embedding_grad_sample": replace_method.append(k) for k in replace_method: opacus.GradSampleModule.GRAD_SAMPLERS[k] = compute_embedding_grad_sample def add_optimizer_compatibility(optimizer): add_noise_wrapper(optimizer) ================================================ FILE: python/fate_llm/algo/dp/opacus_compatibility/grad_sample/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/algo/dp/opacus_compatibility/grad_sample/embedding.py ================================================ # # Copyright (c) Meta Platforms, Inc. and affiliates. # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
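# --- Illustrative usage sketch for DPTrainer / DPTrainingArguments from dp_trainer.py above
# (not part of this module). Assumes opacus and a local 'gpt2' checkpoint are available; whether
# every GPT-2 sub-layer gets a per-sample grad sampler depends on the installed opacus version.
# The toy padded dataset, default_data_collator and SequenceCrossEntropyLoss pairing mirror how
# FDKTSLM.dp_train drives this trainer; this is a rough sketch, not a tested end-to-end run.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
from fate_llm.algo.dp import DPTrainer, DPTrainingArguments
from fate_llm.algo.fdkt.utils.dp_loss import SequenceCrossEntropyLoss

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2")

# toy train set: fixed-length tokenized samples, labels == input_ids for causal LM
texts = ["federated learning with privacy", "differential privacy for language models"] * 4
train_set = []
for t in texts:
    ids = tokenizer(t, padding="max_length", max_length=16, truncation=True)["input_ids"]
    train_set.append({"input_ids": ids, "labels": list(ids)})

args = DPTrainingArguments(
    output_dir="./dp_out",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    max_grad_norm=1.0,
    target_epsilon=3,
    target_delta=1e-5,
    use_cpu=True,                 # keep the sketch on CPU
)
loss_fn = SequenceCrossEntropyLoss("GPT2LMHeadModel", label_smoothing=0.02, reduce=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

trainer = DPTrainer(
    model=model,
    training_args=args,
    train_set=train_set,
    loss_fn=loss_fn,
    optimizer=optimizer,
    data_collator=default_data_collator,
)
trainer.train()
trainer.save_model("./dp_out")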
# import torch import torch.nn as nn from typing import Dict # the function is modified from https://github.com/pytorch/opacus/blob/main/opacus/grad_sample/embedding.py#L25, # avoid dtype error when backprops's dtype isn't torch.float32 def compute_embedding_grad_sample( layer: nn.Embedding, activations: torch.Tensor, backprops: torch.Tensor ) -> Dict[nn.Parameter, torch.Tensor]: """ Computes per sample gradients for ``nn.Embedding`` layer. Args: layer: Layer activations: Activations backprops: Backpropagations """ activations = activations[0] ret = {} if layer.weight.requires_grad: saved = torch.backends.cudnn.deterministic torch.backends.cudnn.deterministic = True batch_size = activations.shape[0] if batch_size == 0: ret[layer.weight] = torch.zeros_like(layer.weight).unsqueeze(0) return ret index = ( activations.unsqueeze(-1) .expand(*activations.shape, layer.embedding_dim) .reshape(batch_size, -1, layer.embedding_dim) ) grad_sample = torch.zeros( batch_size, *layer.weight.shape, device=layer.weight.device, dtype=backprops.dtype ) grad_sample.scatter_add_( 1, index, backprops.reshape(batch_size, -1, layer.embedding_dim) ) torch.backends.cudnn.deterministic = saved ret[layer.weight] = grad_sample return ret ================================================ FILE: python/fate_llm/algo/dp/opacus_compatibility/optimizers/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/algo/dp/opacus_compatibility/optimizers/optimizer.py ================================================ # # Copyright (c) Meta Platforms, Inc. and affiliates. # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import types from opacus.optimizers.optimizer import ( _check_processed_flag, _generate_noise, _mark_as_processed ) # modified from https://github.com/pytorch/opacus/blob/main/opacus/optimizers/optimizer.py#L424 # avoid dtype error when summed_grad's dtype isn't torch.float32 def add_noise(self): """ Adds noise to clipped gradients. 
Stores clipped and noised result in ``p.grad`` """ for p in self.params: _check_processed_flag(p.summed_grad) noise = _generate_noise( std=self.noise_multiplier * self.max_grad_norm, reference=p.summed_grad, generator=self.generator, secure_mode=self.secure_mode, ) noise = noise.to(p.summed_grad.dtype) p.grad = (p.summed_grad + noise).view_as(p) _mark_as_processed(p.summed_grad) def add_noise_wrapper(optimizer): optimizer.add_noise = types.MethodType(add_noise, optimizer) ================================================ FILE: python/fate_llm/algo/dp/opacus_compatibility/transformers_compate.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import torch import transformers from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM from transformers.modeling_utils import unwrap_model def get_model_class(model): if isinstance(model, PELLM): model = model._pe_lm model = unwrap_model(model) return model.__class__ def prepare_position_ids(model, input_ids): if get_model_class(model) == transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel: return _get_position_ids_for_gpt2(input_ids) else: raise ValueError(f"Can not prepare position_ids for model_type={model.__class__}") def _get_position_ids_for_gpt2(input_ids): past_length = 0 position_ids = torch.arange(past_length, input_ids.shape[-1] + past_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0) position_ids = position_ids.repeat(input_ids.shape[0], 1) return position_ids ================================================ FILE: python/fate_llm/algo/fdkt/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from .fdkt_data_aug import ( FDKTSLM, FDKTLLM, FDKTTrainingArguments ) __all__ = [ "FDKTSLM", "FDKTLLM", "FDKTTrainingArguments" ] ================================================ FILE: python/fate_llm/algo/fdkt/cluster/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
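# --- Illustrative sketch for transformers_compate.py above (not part of this module).
# DPTrainer feeds explicit position_ids because the opacus-wrapped forward bypasses the model's
# own position handling; this tiny check shows that the helper currently only understands GPT-2.
# A small randomly-initialised GPT-2 keeps the sketch free of checkpoint downloads.
import torch
from transformers import GPT2Config, GPT2LMHeadModel
from fate_llm.algo.dp.opacus_compatibility.transformers_compate import (
    get_model_class, prepare_position_ids,
)

model = GPT2LMHeadModel(GPT2Config(n_layer=1, n_head=2, n_embd=8))
input_ids = torch.tensor([[5, 6, 7, 8], [1, 2, 3, 4]])

assert get_model_class(model) is GPT2LMHeadModel
position_ids = prepare_position_ids(model, input_ids)
print(position_ids)  # tensor([[0, 1, 2, 3], [0, 1, 2, 3]])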
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/algo/fdkt/cluster/cluster.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from typing import List from .cluster_method import get_cluster_runner class SentenceCluster(object): def __init__(self, model, cluster_method="kmeans", n_clusters=8, **other_cluster_args): self.model = model self.cluster_method = cluster_method self.n_clusters = n_clusters self.other_cluster_args = other_cluster_args def get_embeddings(self, sentences: List[str]): return self.model.encode(sentences) def cluster(self, sentences): embeddings = self.get_embeddings(sentences) cluster_runner = get_cluster_runner(method=self.cluster_method, n_clusters=self.n_clusters, **self.other_cluster_args) cluster_rets = cluster_runner.fit(embeddings) return cluster_rets ================================================ FILE: python/fate_llm/algo/fdkt/cluster/cluster_method.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from sklearn.cluster import KMeans class KMeansRunner(object): def __init__(self, n_clusters, **other_cluster_args): self.n_clusters = n_clusters self.other_cluster_args = other_cluster_args def fit(self, x): model = KMeans(n_clusters=self.n_clusters, **self.other_cluster_args) model.fit(x) return model.labels_ def get_cluster_runner(method, n_clusters, **other_cluster_args): if method.lower() == "kmeans": return KMeansRunner(n_clusters, **other_cluster_args) else: raise ValueError(f"cluster method={method} is not implemented") ================================================ FILE: python/fate_llm/algo/fdkt/fdkt_data_aug.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
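# --- Illustrative sketch for SentenceCluster / get_cluster_runner above (not part of this module).
# In FDKT the LLM side clusters the SLM's synthetic sentences before filtering; in practice `model`
# would be a sentence-embedding model (anything with an .encode() method, e.g. sentence-transformers).
# A toy encoder keeps the sketch self-contained; the sentences and cluster count are illustrative.
import numpy as np
from fate_llm.algo.fdkt.cluster.cluster import SentenceCluster

class ToyEncoder:
    """Stand-in for a sentence-embedding model: returns one 8-dim vector per sentence."""
    def encode(self, sentences):
        rng = np.random.default_rng(0)
        return rng.normal(size=(len(sentences), 8))

sentences = [f"synthetic sample {i}" for i in range(12)]
cluster_ids = SentenceCluster(model=ToyEncoder(), cluster_method="kmeans",
                              n_clusters=3, n_init=10).cluster(sentences)
print(cluster_ids)  # one of 3 cluster ids per sentence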
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os.path import shutil import torch import logging from dataclasses import dataclass, field from ...trainer.seq2seq_trainer import Seq2SeqTrainingArguments from typing import Optional, Callable from fate.arch import Context from transformers import PreTrainedTokenizer from .utils.invalid_data_filter import filter_invalid_data from .utils.text_generate import slm_text_generate, general_text_generate from .cluster.cluster import SentenceCluster from fate_llm.inference.inference_base import Inference logger = logging.getLogger(__name__) SLM_SYNTHETIC_DATA = "slm_synthetic_data" LLM_AUG_DATA = "llm_aug_data" @dataclass class FDKTTrainingArguments(Seq2SeqTrainingArguments): """ slm parameters """ dp_training: bool = field(default=True) target_epsilon: float = field(default=3) target_delta: float = field(default=1e-5) freeze_embedding: bool = field(default=True) device_id: int = field(default=0) slm_generation_config: dict = field(default=None) slm_generation_batch_size: dict = field(default=None) inference_method: str = field(default="native") inference_inst_init_conf: dict = field(default=None) """ slm generation config """ seq_num_for_single_category: int = field(default=None) """ dp loss params """ label_smoothing_factor = 0.02 loss_reduce = True """ llm parameters """ sample_num_per_cluster: int = field(default=None) filter_data_batch_size: int = field(default=2) filter_prompt_max_length: int = field(default=2048) filter_generation_config: dict = field(default=None) aug_generation_config: dict = field(default=None) aug_prompt_num: int = field(default=None) aug_data_batch_size: int = field(default=2) aug_prompt_max_length: int = field(default=2048) def to_dict(self): from dataclasses import fields from enum import Enum d = {field.name: getattr(self, field.name) for field in fields(self) if field.init} for k, v in d.items(): if isinstance(v, Enum): d[k] = v.value if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum): d[k] = [x.value for x in v] if k.endswith("_token"): d[k] = f"<{k.upper()}>" return d class FDKTSLM(object): def __init__( self, ctx: Context, model: torch.nn.Module, training_args: FDKTTrainingArguments, train_set, optimizer: torch.optim.Optimizer = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, data_collator: Callable = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): super(FDKTSLM, self).__init__() self.ctx = ctx self.training_args = training_args self.train_set = train_set self.model = model self.tokenizer = tokenizer self.optimizer = optimizer self.scheduler = scheduler self.data_collator = data_collator if not self.training_args.use_cpu: self.model.cuda(self.training_args.device_id) def aug_data(self): logging.info("Start aug data process") logging.debug(f"dp_training={self.training_args.dp_training}") if self.training_args.dp_training: logging.info("Start dp training") self.dp_train() logging.info("End dp training") inference_inst = self._create_inference_inst() prefix_prompt_dict = self.train_set.get_generate_prompt( tokenize=True if inference_inst is None else False) generated_texts = slm_text_generate( inference_inst, 
self.model, self.tokenizer, prompt_dict=prefix_prompt_dict, seq_num_for_single_category=self.training_args.seq_num_for_single_category, batch_size=self.training_args.slm_generation_batch_size, use_cpu=self.training_args.use_cpu, generation_config=self.training_args.slm_generation_config ) self._destroy_inference_inst() if not self.training_args.use_cpu: self.model.cpu() torch.cuda.empty_cache() generated_texts = filter_invalid_data(generated_texts) self.sync_synthetic_dataset(generated_texts) return self.sync_aug_data() def dp_train(self): from ..dp import DPTrainer, DPTrainingArguments, get_model_class from .utils.dp_loss import SequenceCrossEntropyLoss dp_training_args = DPTrainingArguments( target_delta=self.training_args.target_delta, target_epsilon=self.training_args.target_epsilon, freeze_embedding=self.training_args.freeze_embedding, device_id=self.training_args.device_id, num_train_epochs=self.training_args.num_train_epochs, per_device_train_batch_size=self.training_args.per_device_train_batch_size, output_dir="/" if self.training_args.output_dir is None else self.training_args.output_dir ) loss_fn = SequenceCrossEntropyLoss( get_model_class(self.model).__name__, label_smoothing=self.training_args.label_smoothing_factor, reduce=self.training_args.loss_reduce ) dp_trainer = DPTrainer( model=self.model, training_args=dp_training_args, train_set=self.train_set, optimizer=self.optimizer, scheduler=self.scheduler, data_collator=self.data_collator, loss_fn=loss_fn ) dp_trainer.train() def _create_inference_inst(self): if self.training_args.inference_method == "native": return None elif self.training_args.inference_method == "vllm": from .inference_inst import vllm_init self.model.cpu() model_temp_path = self.training_args.output_dir + "./model_for_inference" self.tokenizer.save_pretrained(model_temp_path) self.model.save_pretrained(model_temp_path) return vllm_init(model_temp_path) if self.training_args.inference_inst_init_conf is None \ else vllm_init(model_temp_path, **self.training_args.inference_inst_init_conf) else: raise ValueError(f"not supported inference_method={self.training_args.inference_method}") def _destroy_inference_inst(self): if self.training_args.inference_method == "vllm": shutil.rmtree(self.training_args.output_dir + "./model_for_inference") elif not self.training_args.use_cpu: self.model.cpu() def sync_synthetic_dataset(self, data): self.ctx.arbiter.put(SLM_SYNTHETIC_DATA, data) def sync_aug_data(self): return self.ctx.arbiter.get(LLM_AUG_DATA) def save_model( self, output_dir="./" ): if hasattr(self.model, "save_pretrained"): self.model.save_pretrained(output_dir) else: if not os.path.exists(output_dir): os.makedirs(output_dir) torch.save(self.model.state_dict(), output_dir + '/pytorch_model.bin') class FDKTLLM(object): def __init__( self, ctx: Context, embedding_model: torch.nn.Module, training_args: FDKTTrainingArguments, dataset, model: Optional[torch.nn.Module] = None, tokenizer: Optional[PreTrainedTokenizer] = None, inference_inst: Optional[Inference] = None, ): super(FDKTLLM, self).__init__() self.ctx = ctx self.inference_inst = inference_inst self.embedding_model = embedding_model self.dataset = dataset self.training_args = training_args self.model = model self.tokenizer = tokenizer if self.inference_inst is None and (self.model is None or self.tokenizer is None): raise ValueError("Inference_inst and Model are both empty, should provided one") if self.model is not None and self.training_args.device_id is not None and not self.training_args.use_cpu: 
self.model.cuda(self.training_args.device_id) def sync_synthetic_data(self): return self.ctx.guest.get(SLM_SYNTHETIC_DATA) def sync_aug_data(self, aug_data): self.ctx.guest.put(LLM_AUG_DATA, aug_data) def aug_data(self): logging.info("sync slm synthetic_data") slm_data = self.sync_synthetic_data() logging.info("filter slm synthetic data") filter_data = self.filter_data(slm_data) logging.info("prepare prompts for aug") aug_prompts = self.dataset.prepare_augment( filter_data["inputs"], filter_data["labels"], aug_prompt_num=self.training_args.aug_prompt_num ) logging.info("aug_data") aug_data = self._aug(aug_prompts) aug_data = filter_invalid_data(aug_data) self.sync_aug_data(aug_data) def _aug(self, aug_prompts): aug_responses = general_text_generate( inference_inst=self.inference_inst, model=self.model, tokenizer=self.tokenizer, generation_config=self.training_args.aug_generation_config, prompts=aug_prompts, batch_size=self.training_args.aug_data_batch_size, use_cpu=self.training_args.use_cpu, prompt_max_length=self.training_args.aug_prompt_max_length ) aug_data = self.dataset.abstract_from_augmented(aug_responses) return aug_data def filter_data(self, slm_data): clustered_sentences, clustered_labels = self.cluster_data(slm_data) filter_prompts = self.dataset.prepare_query_to_filter_clustered(clustered_sentences, clustered_labels) filter_responses = general_text_generate( inference_inst=self.inference_inst, model=self.model, tokenizer=self.tokenizer, generation_config=self.training_args.filter_generation_config, prompts=filter_prompts, batch_size=self.training_args.filter_data_batch_size, use_cpu=self.training_args.use_cpu, prompt_max_length=self.training_args.filter_prompt_max_length ) filtered_sentences, filtered_labels = self.dataset.parse_clustered_response( clustered_sentence=clustered_sentences, clustered_labels=clustered_labels, response_list=filter_responses ) return dict( inputs=filtered_sentences, labels=filtered_labels ) def cluster_data(self, slm_data): sentences = slm_data["inputs"] labels = slm_data["labels"] n_clusters = (len(sentences) + self.training_args.sample_num_per_cluster - 1) // self.training_args.sample_num_per_cluster cluster_ret = SentenceCluster(model=self.embedding_model, n_clusters=n_clusters).cluster(sentences) clustered_sentences = [[] for _ in range(n_clusters)] clustered_labels = [[] for _ in range(n_clusters)] for sentence_id, cluster_id in enumerate(cluster_ret): clustered_sentences[cluster_id].append(sentences[sentence_id]) clustered_labels[cluster_id].append(labels[sentence_id]) return clustered_sentences, clustered_labels ================================================ FILE: python/fate_llm/algo/fdkt/inference_inst.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
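# --- Illustrative sketch for FDKTTrainingArguments (not part of this module). FDKTSLM and FDKTLLM
# above are driven entirely by these arguments; the sketch shows which knobs each party typically
# sets. All values are illustrative, and the generation-config dicts are assumed to be forwarded to
# model.generate() or to the configured inference backend.
from fate_llm.algo.fdkt import FDKTTrainingArguments

# SLM (guest) side: DP fine-tuning + synthetic-text generation knobs
slm_args = FDKTTrainingArguments(
    output_dir="./fdkt_slm",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    dp_training=True,
    target_epsilon=3,
    target_delta=1e-5,
    seq_num_for_single_category=64,      # synthetic sequences generated per label
    slm_generation_batch_size=8,
    slm_generation_config={"max_new_tokens": 64, "do_sample": True, "top_p": 0.9},
    use_cpu=True,
)

# LLM (arbiter) side: clustering / filtering / augmentation knobs
llm_args = FDKTTrainingArguments(
    output_dir="./fdkt_llm",
    sample_num_per_cluster=8,
    aug_prompt_num=4,
    filter_generation_config={"max_new_tokens": 16},
    aug_generation_config={"max_new_tokens": 256, "do_sample": True},
    use_cpu=True,
)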
# def api_init(api_url: str, model_name: str, api_key: str = 'EMPTY', api_timeout=3600): from fate_llm.inference.api import APICompletionInference return APICompletionInference( api_url=api_url, model_name=model_name, api_key=api_key, api_timeout=api_timeout ) def vllm_init(model_path: str, num_gpu=1, dtype='float16', gpu_memory_utilization=0.9): from fate_llm.inference.vllm import VLLMInference return VLLMInference( model_path=model_path, num_gpu=num_gpu, dtype=dtype, gpu_memory_utilization=gpu_memory_utilization ) ================================================ FILE: python/fate_llm/algo/fdkt/utils/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/algo/fdkt/utils/dp_loss.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
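# --- Illustrative sketch for api_init above (not part of this module). api_init / vllm_init build
# the optional inference backends FDKT can use instead of local model.generate(). The endpoint URL
# and served model name are placeholders for an OpenAI-compatible server (e.g. a vLLM deployment),
# and the generation kwargs are assumed to be passed through to the completions API.
from fate_llm.algo.fdkt.inference_inst import api_init

inst = api_init(
    api_url="http://127.0.0.1:8000/v1",
    model_name="my-served-llm",
    api_key="EMPTY",
)
texts = inst.inference(
    ["Generate one positive product review."],
    {"max_tokens": 64, "temperature": 0.7},
)
print(texts[0])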
# import torch import torch.nn as nn import torch.nn.functional as F from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES NUMERICAL_STABILITY_CONSTANT = 1e-13 class SequenceCrossEntropyLoss(nn.Module): def __init__(self, model_type, label_smoothing=-1, reduce=None): super().__init__() self.model_type = model_type self.label_smoothing = label_smoothing self.reduce = reduce def forward(self, logits, targets, mask): return sequence_cross_entropy_with_logits(logits, targets, mask, self.label_smoothing, self.reduce, self.model_type) def sequence_cross_entropy_with_logits(logits, targets, mask, label_smoothing, reduce, model_type): if model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): logits = logits[:, :-1].contiguous() targets = targets[:, 1:] mask = torch.ones_like(targets).float() logits_flat = logits.view(-1, logits.size(-1)) log_probs_flat = F.log_softmax(logits_flat, dim=-1) targets_flat = targets.reshape(-1, 1).long() if label_smoothing > 0.0: num_classes = logits.size(-1) smoothing_value = label_smoothing / float(num_classes) one_hot_targets = torch.zeros_like(log_probs_flat).scatter_(-1, targets_flat, 1.0 - label_smoothing) smoothed_targets = one_hot_targets + smoothing_value negative_log_likelihood_flat = -log_probs_flat * smoothed_targets negative_log_likelihood_flat = negative_log_likelihood_flat.sum(-1, keepdim=True) else: negative_log_likelihood_flat = - torch.gather(log_probs_flat, dim=1, index=targets_flat) negative_log_likelihood = negative_log_likelihood_flat.view(-1, logits.shape[1]) loss = negative_log_likelihood * mask if reduce: loss = loss.sum(1) / (mask.sum(1) + NUMERICAL_STABILITY_CONSTANT) if reduce is "batch": loss = loss.mean() return loss ================================================ FILE: python/fate_llm/algo/fdkt/utils/invalid_data_filter.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # INVALID_CHARACTERS = "".join([' ', '-', '.', '_', '~', '/', '\\', '*', '|', '#']) LEAST_WORDS = 10 def filter_invalid_data(data_dict): sample_num = len(data_dict["inputs"]) new_data_dict = dict( inputs=list(), labels=list() ) for idx in range(sample_num): text = data_dict["inputs"][idx].strip(INVALID_CHARACTERS) if len(text.split()) < LEAST_WORDS: continue new_data_dict["inputs"].append(text) new_data_dict["labels"].append(data_dict["labels"][idx]) return new_data_dict ================================================ FILE: python/fate_llm/algo/fdkt/utils/text_generate.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
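# --- Illustrative sketch for dp_loss.py and invalid_data_filter.py above (not part of this module).
# SequenceCrossEntropyLoss is the token-level loss FDKTSLM.dp_train hands to DPTrainer: for causal
# LM model types it shifts logits/targets internally and returns one loss per sample. Shapes and
# values below are random placeholders.
import torch
from fate_llm.algo.fdkt.utils.dp_loss import SequenceCrossEntropyLoss
from fate_llm.algo.fdkt.utils.invalid_data_filter import filter_invalid_data

batch, seq_len, vocab = 2, 6, 50
logits = torch.randn(batch, seq_len, vocab)
targets = torch.randint(0, vocab, (batch, seq_len))
mask = torch.ones(batch, seq_len)

loss_fn = SequenceCrossEntropyLoss("GPT2LMHeadModel", label_smoothing=0.02, reduce=True)
per_sample_loss = loss_fn(logits, targets, mask)
print(per_sample_loss.shape)  # torch.Size([2]) -> one value per sample; .mean() gives the batch loss

# the synthetic-text filter drops very short or punctuation-only generations
cleaned = filter_invalid_data({
    "inputs": ["---", "a properly formed synthetic sentence with more than ten words in it"],
    "labels": [0, 1],
})
print(cleaned["labels"])  # [1]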
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from tqdm import tqdm from typing import Any, Dict, List def slm_text_generate( inference_inst, model, tokenizer, prompt_dict, seq_num_for_single_category, batch_size, use_cpu, generation_config ): generated_ret = dict( inputs=list(), labels=list(), ) if inference_inst is not None: for label, prompt in prompt_dict.items(): generated_sequences = inference_inst.inference([prompt] * seq_num_for_single_category, generation_config) for g in generated_sequences: generated_ret["inputs"].append(g) generated_ret["labels"].append(label) else: model.eval() for label, prompt_ids in prompt_dict.items(): prompt_length = len(prompt_ids) batch_num = (seq_num_for_single_category + batch_size - 1) // batch_size for batch_idx in tqdm(range(batch_num)): if batch_idx + 1 == batch_num: cur_batch_size = seq_num_for_single_category - batch_idx * batch_size else: cur_batch_size = batch_size input_ids = prompt_ids.repeat(cur_batch_size, 1) if not use_cpu: input_ids = input_ids.to(model.device) output_sequences = model.generate( input_ids=input_ids, **generation_config ) output_sequences = output_sequences[:, prompt_length:] generated_sequences = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) for g in generated_sequences: generated_ret["inputs"].append(g) generated_ret["labels"].append(label) return generated_ret def general_text_generate( inference_inst, model, tokenizer, generation_config: Dict[Any, Any], prompts: List[str], batch_size, use_cpu: bool, prompt_max_length ): if inference_inst is not None: if prompt_max_length is not None: prompts = [prompt[:prompt_max_length] for prompt in prompts] generate_texts = inference_inst.inference(prompts, generation_config) else: model.eval() generate_texts = [] batch_num = (len(prompts) + batch_size - 1) // batch_size for batch_idx in range(batch_num): batch_data = prompts[batch_idx * batch_size: (batch_idx + 1) * batch_size] inputs = tokenizer(batch_data, return_tensors="pt", padding="longest", truncation=True, max_length=prompt_max_length) input_ids = inputs["input_ids"] attention_mask = inputs["attention_mask"] if not use_cpu: input_ids = input_ids.to(model.device) attention_mask = attention_mask.to(model.device) output = model.generate( input_ids=input_ids, attention_mask=attention_mask, **generation_config ) batch_responses = tokenizer.batch_decode(output[:, input_ids.shape[1]:], skip_special_tokens=True) generate_texts.extend(batch_responses) return generate_texts ================================================ FILE: python/fate_llm/algo/fedavg/__init__.py ================================================ ================================================ FILE: python/fate_llm/algo/fedavg/fedavg.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
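# --- Illustrative sketch for general_text_generate above (not part of this module). With
# inference_inst=None the helper batches prompts through the local model's generate(); assumes a
# local 'gpt2' checkpoint, and the prompts / generation config are illustrative. GPT-2 has no pad
# token, so the EOS token is reused for the "longest" padding the helper applies.
from transformers import AutoModelForCausalLM, AutoTokenizer
from fate_llm.algo.fdkt.utils.text_generate import general_text_generate

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2")

responses = general_text_generate(
    inference_inst=None,                 # None -> generate with the local model
    model=model,
    tokenizer=tokenizer,
    generation_config={"max_new_tokens": 20, "do_sample": False,
                       "pad_token_id": tokenizer.eos_token_id},
    prompts=["The advertisement should mention", "Write one sentence about privacy:"],
    batch_size=2,
    use_cpu=True,
    prompt_max_length=128,
)
print(responses)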
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import torch from fate.ml.nn.homo.fedavg import FedAVGServer, FedAVGArguments, FedArguments from fate.arch import Context from fate_llm.trainer.seq2seq_trainer import HomoSeq2SeqTrainerClient, Seq2SeqTrainingArguments from fate.ml.aggregator import AggregatorClientWrapper import logging from typing import List, Optional, Tuple, Callable, Dict from fate.arch import Context from torch.optim import Optimizer from torch.utils.data import Dataset from torch.optim.lr_scheduler import _LRScheduler from transformers.trainer_callback import TrainerCallback from torch import nn from torch.utils.data import DataLoader from transformers import TrainerState, TrainerControl, PreTrainedTokenizer, EvalPrediction logger = logging.getLogger(__name__) Seq2SeqFedAVGServer = FedAVGServer class Seq2SeqFedAVGClient(HomoSeq2SeqTrainerClient): def __init__( self, ctx: Context, model: nn.Module, training_args: Seq2SeqTrainingArguments, fed_args: FedArguments, train_set: Dataset, val_set: Dataset = None, optimizer: torch.optim.Optimizer = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, data_collator: Callable = None, tokenizer: Optional[PreTrainedTokenizer] = None, callbacks: Optional[List[TrainerCallback]] = [], compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, local_mode: bool = False, save_trainable_weights_only: bool = False, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): # in case you forget to set evaluation_strategy if val_set is not None and training_args.evaluation_strategy == "no": training_args.evaluation_strategy = "epoch" HomoSeq2SeqTrainerClient.__init__( self, ctx, model, training_args, fed_args, train_set, val_set, optimizer, data_collator, scheduler, tokenizer, callbacks, compute_metrics, local_mode, save_trainable_weights_only, preprocess_logits_for_metrics ) def init_aggregator(self, ctx: Context, fed_args: FedArguments): aggregate_type = "weighted_mean" aggregator_name = "fedavg" aggregator = fed_args.aggregator return AggregatorClientWrapper( ctx, aggregate_type, aggregator_name, aggregator, sample_num=len(self.train_dataset), args=self._args ) def on_federation( self, ctx: Context, aggregator: AggregatorClientWrapper, fed_args: FedArguments, args: Seq2SeqTrainingArguments, model: Optional[nn.Module] = None, optimizer: Optional[Optimizer] = None, scheduler: Optional[_LRScheduler] = None, dataloader: Optional[Tuple[DataLoader]] = None, control: Optional[TrainerControl] = None, state: Optional[TrainerState] = None, **kwargs, ): aggregator.model_aggregation(ctx, model) ================================================ FILE: python/fate_llm/algo/fedcollm/__init__.py ================================================ ================================================ FILE: python/fate_llm/algo/fedcollm/fedcollm.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
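# --- Illustrative sketch for Seq2SeqFedAVGClient above (not part of this module). A hedged view of
# how a runner might build the client-side trainer: `ctx` is the fate Context injected by the
# runner, and `model`, `train_set`, `data_collator`, `tokenizer` are placeholders supplied by the
# surrounding loaders; all argument values are illustrative.
from fate.ml.nn.homo.fedavg import FedAVGArguments
from fate_llm.algo.fedavg.fedavg import Seq2SeqFedAVGClient
from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments

def build_client_trainer(ctx, model, train_set, data_collator, tokenizer):
    training_args = Seq2SeqTrainingArguments(
        output_dir="./fedavg_out",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        remove_unused_columns=False,
    )
    return Seq2SeqFedAVGClient(
        ctx=ctx,
        model=model,
        training_args=training_args,
        fed_args=FedAVGArguments(),          # defaults; aggregation happens in on_federation
        train_set=train_set,
        data_collator=data_collator,
        tokenizer=tokenizer,
        save_trainable_weights_only=True,
    )

# trainer = build_client_trainer(ctx, model, train_set, collator, tokenizer)
# trainer.train()   # federates with Seq2SeqFedAVGServer running on the arbiter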
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import torch import logging from fate_llm.algo.fedcollm.fedcollm_trainer import FedCoLLMTrainer from typing import Dict, Optional, List, Callable, Union from fate.arch import Context from fate.ml.nn.trainer.trainer_base import FedArguments from torch.utils.data import Dataset from transformers.trainer_callback import TrainerCallback from transformers import PreTrainedTokenizer from transformers import Seq2SeqTrainer from transformers.trainer_utils import EvalPrediction from transformers.modeling_utils import PreTrainedModel from transformers.modeling_utils import unwrap_model from fate_llm.algo.fedmkt.utils.generate_logit_utils import generate_pub_data_logits from fate.ml.aggregator import AggregatorClientWrapper, AggregatorServerWrapper from fate_llm.algo.fedcollm.fedcollm_training_args import FedCoLLMTrainingArguments from types import SimpleNamespace logger = logging.getLogger(__name__) class FedCoLLMBase(object): @staticmethod def update_model(model, updated_params): for updated_p, p in zip(updated_params, [p for p in model.parameters() if p.requires_grad]): p.data.copy_(t.Tensor(updated_p)) class SLM(FedCoLLMBase): def __init__( self, ctx: Context, model: torch.nn.Module, training_args: FedCoLLMTrainingArguments, fed_args: FedArguments = None, train_set=None, val_set: Dataset = None, optimizer: torch.optim.Optimizer = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, data_collator: Callable = None, tokenizer: Optional[PreTrainedTokenizer] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = [], save_trainable_weights_only: bool = False, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): super(SLM, self).__init__() self.ctx = ctx self.training_args = training_args self.fed_args = fed_args self.model = model self.tokenizer = tokenizer self.model_init = model_init self.callbacks = callbacks self.compute_metrics = compute_metrics self.save_trainable_weights_only = save_trainable_weights_only self.preprocess_logits_for_metrics = preprocess_logits_for_metrics self.data_collator = data_collator self.optimizer = optimizer self.scheduler = scheduler self.train_set = train_set self.val_set = val_set self.aggregator = self._init_aggregator(ctx, fed_args) def train(self): global_epochs = self.training_args.global_epochs for i, iter_ctx in self.ctx.on_iterations.ctxs_range(global_epochs): logger.info(f"begin {i}-th global kd process") training_args = self._get_slm_training_args() trainer = Seq2SeqTrainer( model=self.model, tokenizer=self.tokenizer, data_collator=self.data_collator, train_dataset=self.train_set, args=training_args, model_init=self.model_init if not i else None, compute_metrics=self.compute_metrics, callbacks=self.callbacks, optimizers=(self.optimizer, self.scheduler), preprocess_logits_for_metrics=self.preprocess_logits_for_metrics ) logger.info(f"begin {i}-th private data training process") trainer.train() self.model = unwrap_model(trainer.model) 
self.aggregator.model_aggregation(iter_ctx, self.model) def _sync_slm_updated_params(self, iter_ctx): updated_params = iter_ctx.arbiter.get("slm_updated_params") self.update_model(self.model, updated_params) def _get_slm_training_args(self): return self.training_args.to_slm_seq_training_args() def _init_aggregator(self, ctx: Context, fed_args: FedArguments): aggregate_type = "weighted_mean" aggregator_name = "fedavg" aggregator = fed_args.aggregator return AggregatorClientWrapper( ctx, aggregate_type, aggregator_name, aggregator, sample_num=len(self.train_set), args=self.training_args ) class LLM(FedCoLLMBase): def __init__( self, ctx: Context, llm_model: torch.nn.Module, slm_model: torch.nn.Module, training_args: FedCoLLMTrainingArguments, fed_args: FedArguments = None, train_set=None, val_set: Dataset = None, llm_optimizer: torch.optim.Optimizer = None, slm_optimizer: torch.optim.Optimizer = None, llm_lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, slm_lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, data_collator: Callable = None, tokenizer: Optional[PreTrainedTokenizer] = None, llm_model_init: Optional[Callable[[], PreTrainedModel]] = None, slm_model_init: Optional[Callable[[], PreTrainedModel]] = None, llm_compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, slm_compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, llm_callbacks: Optional[List[TrainerCallback]] = [], slm_callbacks: Optional[List[TrainerCallback]] = [], save_trainable_weights_only: bool = False, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): super(LLM, self).__init__() self.ctx = ctx self.llm_model = llm_model self.slm_model = slm_model self.training_args = training_args self.fed_args = fed_args self.train_set = train_set self.val_set = val_set self.llm_optimizer = llm_optimizer self.slm_optimizer = slm_optimizer self.llm_lr_scheduler = llm_lr_scheduler self.slm_lr_scheduler = slm_lr_scheduler self.data_collator = data_collator self.tokenizer = tokenizer self.llm_model_init = llm_model_init self.slm_model_init = slm_model_init self.llm_compute_metrics = llm_compute_metrics self.slm_compute_metrics = slm_compute_metrics self.llm_callbacks = llm_callbacks self.slm_callbacks = slm_callbacks self.save_trainable_weights_only = save_trainable_weights_only self.preprocess_logits_for_metrics = preprocess_logits_for_metrics self.aggregator = self._init_aggregator(ctx) def _init_aggregator(self, ctx: Context): return AggregatorServerWrapper(ctx) def _get_logits(self, model): if self.training_args.device.type == "cuda": model.cuda(self.training_args.device.type) fn_kwargs = {"model": model, "training_args": self.training_args, "data_collator": self.data_collator} return self.train_set.map( generate_pub_data_logits, batched=True, batch_size=self.training_args.per_device_train_batch_size, num_proc=None, load_from_cache_file=True, fn_kwargs=fn_kwargs ) def on_epoch_begin(self, iter_ctx): self.aggregator.model_aggregation(iter_ctx) updated_slm_params = iter_ctx() self.update_model(self.slm_model, updated_slm_params) def _sync_slm_updated_params(self, iter_ctx): updated_params = [p for p in self.slm_model.parameters() if p.requires_grad] iter_ctx.guest.put("slm_updated_params", updated_params) if any(p.role == 'host' for p in self.ctx.parties): iter_ctx.hosts.put("slm_updated_params", updated_params) def _train_slm(self, iter_ctx, llm_pub_logits, epoch_idx): top_k_args = SimpleNamespace( 
top_k_logits_keep=self.training_args.top_k_logits_keep, top_k_strategy=self.training_args.top_k_strategy ) self.train_set.set_return_with_idx() trainer = FedCoLLMTrainer( model=self.slm_model, tokenizer=self.tokenizer, data_collator=self.data_collator, train_dataset=self.train_set, args=self.training_args.to_slm_seq_training_args(), model_init=self.slm_model_init if not epoch_idx else None, compute_metrics=self.slm_compute_metrics, callbacks=self.slm_callbacks, optimizers=(self.slm_optimizer, self.slm_lr_scheduler), preprocess_logits_for_metrics=self.preprocess_logits_for_metrics, top_k_args=top_k_args, distill_lambda=self.training_args.distill_lambda, distill_temperature=self.training_args.distill_temperature, max_length=max(len(d["input_ids"]) for d in self.train_set), vocab_size=self.training_args.vocab_size, dtype=next(self.slm_model.parameters()).dtype, other_logits=llm_pub_logits ) trainer.train() self.slm_model = unwrap_model(trainer.model) self.train_set.reset_return_with_idx() self._sync_slm_updated_params(iter_ctx) def _train_llm(self, slm_pub_logits, epoch_idx): top_k_args = SimpleNamespace( top_k_logits_keep=self.training_args.top_k_logits_keep, top_k_strategy=self.training_args.top_k_strategy ) self.train_set.set_return_with_idx() trainer = FedCoLLMTrainer( model=self.llm_model, tokenizer=self.tokenizer, data_collator=self.data_collator, train_dataset=self.train_set, args=self.training_args.to_llm_seq_training_args(), model_init=self.llm_model_init if not epoch_idx else None, compute_metrics=self.llm_compute_metrics, callbacks=self.llm_callbacks, optimizers=(self.llm_optimizer, self.llm_lr_scheduler), preprocess_logits_for_metrics=self.preprocess_logits_for_metrics, top_k_args=top_k_args, distill_lambda=self.training_args.distill_lambda, distill_temperature=self.training_args.distill_temperature, max_length=max(len(d["input_ids"]) for d in self.train_set), vocab_size=self.training_args.vocab_size, dtype=next(self.slm_model.parameters()).dtype, other_logits=slm_pub_logits ) trainer.train() self.llm_model = unwrap_model(trainer.model) self.train_set.reset_return_with_idx() def train(self): global_epochs = self.training_args.global_epochs for i, iter_ctx in self.ctx.on_iterations.ctxs_range(global_epochs): logger.info(f"begin {i}-th global kd process") self.on_epoch_begin(iter_ctx) logger.info(f"get pub data logits for llm of global epoch={i}") llm_pub_data_logits = self._get_logits(self.llm_model) logger.info(f"train slm of global epoch={i}") self._train_slm(iter_ctx, llm_pub_data_logits, i) logger.info(f"get pub data logits for trained slm of global epoch={i}") slm_pub_data_logits = self._get_logits(self.slm_model) logger.info(f"train llm of global epoch={i}") self._train_llm(slm_pub_data_logits, i) ================================================ FILE: python/fate_llm/algo/fedcollm/fedcollm_trainer.py ================================================ # # NOTE: The implementations of FedMKTTrainer is modified from FuseAI/FuseLLM # Copyright FuseAI # # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # import logging import torch from torch.nn.functional import kl_div, log_softmax, softmax from transformers import Seq2SeqTrainer from fate_llm.algo.fedmkt.utils.generate_logit_utils import LogitsSelection from fate_llm.algo.fedmkt.utils.vars_define import ( PER_STEP_LOGITS, PER_STEP_INDICES, ) from types import SimpleNamespace logger = logging.getLogger(__name__) def computing_kd_loss(src_logits, dst_logits, loss_mask): src_logits = src_logits[loss_mask] dst_logits = dst_logits[loss_mask] return kl_div( log_softmax(src_logits, dim=-1, dtype=torch.float32), dst_logits, log_target=False, reduction="none").sum(dim=-1) def recovery_logits( top_k_logits, top_k_indices, batch_size, max_length, vocab_size, dtype, device, pad_id, distill_temperature ): logits = torch.zeros(batch_size, max_length, vocab_size).to(dtype).to(device) for i in range(batch_size): base_seq_len = len(top_k_logits[i]) for j in range(max_length): if j < base_seq_len: base_logits = torch.tensor(top_k_logits[i][j], dtype=dtype) base_prob = softmax(base_logits / distill_temperature, -1) base_indices = torch.tensor(top_k_indices[i][j]) base_prob = base_prob.to(device) base_indices = base_indices.cuda(device) logits[i][j] = logits[i][j].scatter_(-1, base_indices, base_prob) else: # padding position logits[i][j][pad_id] = 1.0 return logits class FedCoLLMTrainer(Seq2SeqTrainer): distill_lambda: float = 1.0 distill_temperature: float = 1.0 other_logits = None dtype: torch.dtype = torch.bfloat16 vocab_size: int = None max_length: int = None top_k_args: SimpleNamespace = None def __init__(self, **kwargs): distill_lambda = kwargs.pop("distill_lambda", 1.0) distill_temperature = kwargs.pop("distill_temperature", 1.0) other_logits = kwargs.pop("other_logits") vocab_size = kwargs.pop("vocab_size") max_length = kwargs.pop("max_length") top_k_args = kwargs.pop("top_k_args") super(FedCoLLMTrainer, self).__init__(**kwargs) self.distill_lambda = distill_lambda self.distill_temperature = distill_temperature self.other_logits = other_logits self.pad_id = self.tokenizer.pad_token_id self.vocab_size = vocab_size self.max_length = max_length self.top_k_args = top_k_args def compute_loss(self, model, inputs, return_outputs=False): lm_outputs = model(**inputs['inputs']) lm_loss = lm_outputs.loss logits = lm_outputs.logits other_logits = self.other_logits[inputs["indexes"]] batch_size = logits.shape[0] top_k_logits, top_k_indices = LogitsSelection.select_logits(logits, self.top_k_args) dst_logits = recovery_logits( other_logits[PER_STEP_INDICES], other_logits[PER_STEP_INDICES], batch_size, self.max_length, self.vocab_size, self.dtype, logits.device, self.pad_id, self.distill_temperature ) src_logits = recovery_logits( top_k_logits, top_k_indices, batch_size, self.max_length, self.vocab_size, self.dtype, logits.device, self.pad_id, self.distill_temperature ) loss_mask = (inputs["inputs"]["labels"] != -100) kl_loss = computing_kd_loss(src_logits, dst_logits, loss_mask=loss_mask).sum() return lm_loss + self.distill_lambda * kl_loss ================================================ FILE: python/fate_llm/algo/fedcollm/fedcollm_training_args.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
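# --- Illustrative sketch for computing_kd_loss above (not part of this module). This is the
# distillation term FedCoLLMTrainer adds to the LM loss: the student side passes raw logits, the
# teacher side arrives as probabilities (see recovery_logits), and only positions with labels are
# kept via the boolean mask. Shapes and values are random placeholders.
import torch
from torch.nn.functional import softmax
from fate_llm.algo.fedcollm.fedcollm_trainer import computing_kd_loss

batch, seq_len, vocab = 2, 4, 10
student_logits = torch.randn(batch, seq_len, vocab)
teacher_probs = softmax(torch.randn(batch, seq_len, vocab), dim=-1)
loss_mask = torch.tensor([[True, True, True, False],
                          [True, True, False, False]])   # e.g. labels != -100

token_kl = computing_kd_loss(student_logits, teacher_probs, loss_mask)
print(token_kl.shape)          # torch.Size([5]) -> one KL term per unmasked position
kd_loss = token_kl.sum()       # summed, then scaled by distill_lambda in compute_loss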
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from dataclasses import dataclass, field

from ...trainer.seq2seq_trainer import Seq2SeqTrainingArguments


@dataclass
class FedCoLLMTrainingArguments(Seq2SeqTrainingArguments):
    """
    top-k logits select params
    """
    top_k_logits_keep: int = field(default=128)
    top_k_strategy: str = field(default="highest")
    vocab_size: int = field(default=None)

    """
    distillation params
    """
    distill_lambda: float = field(default=1.0)
    distill_temperature: float = field(default=1.0)

    server_public_data_local_epoch: int = field(default=1)
    client_public_data_local_epoch: int = field(default=1)
    client_priv_data_local_epoch: int = field(default=1)
    global_epochs: int = field(default=1)

    extra_args = ["top_k_logits_keep", "top_k_strategy", "vocab_size", "distill_lambda",
                  "distill_temperature", "server_public_data_local_epoch",
                  "client_public_data_local_epoch", "client_priv_data_local_epoch", "global_epochs"]

    def to_dict(self):
        from dataclasses import fields
        from enum import Enum
        d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}
        for k, v in d.items():
            if isinstance(v, Enum):
                d[k] = v.value
            if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
                d[k] = [x.value for x in v]
            if k.endswith("_token"):
                d[k] = f"<{k.upper()}>"
        return d

    def _pop_extra(self):
        args = self.to_dict()
        for arg in self.extra_args:
            args.pop(arg)
        return args

    def to_slm_seq_training_args(self):
        args = self._pop_extra()
        args["num_train_epochs"] = self.client_priv_data_local_epoch
        return Seq2SeqTrainingArguments(**args)

    def to_fedco_slm_training_args(self):
        args = self._pop_extra()
        args["num_train_epochs"] = self.client_public_data_local_epoch
        return Seq2SeqTrainingArguments(**args)

    def to_fedco_llm_training_args(self):
        args = self._pop_extra()
        args["num_train_epochs"] = self.server_public_data_local_epoch
        return Seq2SeqTrainingArguments(**args)



================================================
FILE: python/fate_llm/algo/fedcot/__init__.py
================================================


================================================
FILE: python/fate_llm/algo/fedcot/encoder_decoder/__init__.py
================================================


================================================
FILE: python/fate_llm/algo/fedcot/encoder_decoder/init/__init__.py
================================================


================================================
FILE: python/fate_llm/algo/fedcot/encoder_decoder/init/default_init.py
================================================
#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
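#
# NOTE (descriptive sketch): the two *Init classes below are thin factories —
# get_inst() builds an APICompletionInference from the class-level `api_url`,
# `api_model_name` and `api_key` attributes and wraps it in an
# SLMEncoderDecoderClient (client side) or SLMEncoderDecoderServer (server side).
# The attributes are empty by default, so they are expected to be set (e.g. via
# subclassing or job configuration) before get_inst() is called.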
# from fate_llm.algo.inferdpt.init._init import InferInit from fate_llm.inference.api import APICompletionInference from fate_llm.algo.fedcot.encoder_decoder.slm_encoder_decoder import SLMEncoderDecoderClient, SLMEncoderDecoderServer class FedCoTEDAPIClientInit(InferInit): api_url = '' api_model_name = '' api_key = 'EMPTY' def __init__(self, ctx): super().__init__(ctx) self.ctx = ctx def get_inst(self): inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key) client = SLMEncoderDecoderClient(self.ctx, inference) return client class FedCoTEDAPIServerInit(InferInit): api_url = '' api_model_name = '' api_key = 'EMPTY' def __init__(self, ctx): super().__init__(ctx) self.ctx = ctx def get_inst(self): inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key) return SLMEncoderDecoderServer(self.ctx, inference) ================================================ FILE: python/fate_llm/algo/fedcot/encoder_decoder/slm_encoder_decoder.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import copy from jinja2 import Template from tqdm import tqdm from fate.arch import Context from typing import List, Dict, Union from fate.ml.nn.dataset.base import Dataset from fate_llm.algo.inferdpt.utils import InferDPTKit from openai import OpenAI import logging from fate_llm.inference.inference_base import Inference from fate_llm.algo.inferdpt.inferdpt import InferDPTClient, InferDPTServer from fate_llm.dataset.hf_dataset import HuggingfaceDataset logger = logging.getLogger(__name__) class SLMEncoderDecoderClient(InferDPTClient): def __init__(self, ctx: Context, local_inference_inst: Inference) -> None: self.ctx = ctx self.comm_idx = 0 self.local_inference_inst = local_inference_inst self.local_inference_kwargs = {} def encode(self, docs: List[Dict[str, str]], format_template: str = None, verbose=False, perturb_doc_key: str ='perturbed_doc') -> List[Dict[str, str]]: template = Template(format_template) copy_docs = copy.deepcopy(docs) doc_to_infer = [] for doc in tqdm(copy_docs): rendered_doc = template.render(**doc) doc_to_infer.append(rendered_doc) # perturb using local model inference self.doc_to_infer = doc_to_infer infer_result = self.local_inference_inst.inference(doc_to_infer, self.local_inference_kwargs) for doc, pr in zip(copy_docs, infer_result): doc[perturb_doc_key] = pr self.doc_with_p = copy_docs return copy_docs def decode(self, p_docs: List[Dict[str, str]], instruction_template: str = None, decode_template: str = None, verbose=False, perturbed_response_key: str = 'perturbed_response', result_key: str = 'result', remote_inference_kwargs: dict = {}, local_inference_kwargs: dict = {}): return super().decode(p_docs, instruction_template, decode_template, verbose, perturbed_response_key, result_key, remote_inference_kwargs, local_inference_kwargs) def inference(self, docs: Union[List[Dict[str, str]], 
HuggingfaceDataset], encode_template: str, instruction_template: str, decode_template: str, verbose: bool = False, remote_inference_kwargs: dict = {}, local_inference_kwargs: dict = {}, perturb_doc_key: str = 'perturbed_doc', perturbed_response_key: str = 'perturbed_response', result_key: str = 'result', ) -> List[Dict[str, str]]: self.local_inference_kwargs = local_inference_kwargs return super().inference(docs, encode_template, instruction_template, decode_template, verbose, remote_inference_kwargs, \ local_inference_kwargs, perturb_doc_key, perturbed_response_key, result_key) class SLMEncoderDecoderServer(InferDPTServer): pass ================================================ FILE: python/fate_llm/algo/fedcot/fedcot_trainer.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os import pickle import time from torch import nn from typing import List, Optional, Callable, Literal, Union from fate.arch import Context from torch.utils.data import DataLoader, Dataset from transformers.trainer_callback import TrainerCallback from transformers import PreTrainedTokenizer import logging import torch import torch.distributed as dist from fate_llm.dataset.fedcot_dataset import PrefixDataset from transformers.modeling_utils import unwrap_model from transformers import PreTrainedTokenizer, PreTrainedModel from typing import Dict, Any from transformers import Seq2SeqTrainingArguments from transformers.trainer_utils import EvalPrediction from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainer, Seq2SeqTrainingArguments from fate_llm.inference.inference_base import Inference from fate_llm.algo.inferdpt.inferdpt import InferDPTClient, InferDPTServer from fate_llm.algo.fedcot.encoder_decoder.slm_encoder_decoder import SLMEncoderDecoderClient, SLMEncoderDecoderServer logger = logging.getLogger(__name__) _MODE = ['train_only', 'infer_only', 'infer_and_train'] # share obj between ranks in an easy way def save_to(obj, filepath, filename='tmp.pkl'): if not os.path.exists(filepath): os.mkdir(filepath) path = filepath + filename with open(path, 'wb') as f: pickle.dump(obj, f) dist.barrier() os.remove(path) def load(filepath, filename='tmp.pkl'): path = filepath + filename while not os.path.exists(path): time.sleep(0.1) while True: try: with open(path, 'rb') as f: d = pickle.load(f) break except (EOFError, pickle.UnpicklingError): time.sleep(0.1) dist.barrier() return d class DSSTrainerClient(Seq2SeqTrainer): def __init__(self, model: nn.Module, training_args: Seq2SeqTrainingArguments, train_set: Dataset, val_set: Dataset = None, alpha: float = 0.5, optimizer: torch.optim.Optimizer = None, data_collator: Callable = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, tokenizer: Optional[PreTrainedTokenizer] = None, callbacks: Optional[List[TrainerCallback]] = [], compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, 
torch.Tensor], torch.Tensor]] = None ) -> None: self.alpha = alpha Seq2SeqTrainer.__init__( self, model=model, args=training_args, train_dataset=train_set, eval_dataset=val_set, data_collator=data_collator, optimizers=(optimizer, scheduler), tokenizer=tokenizer, preprocess_logits_for_metrics=preprocess_logits_for_metrics, compute_metrics=compute_metrics, callbacks=callbacks, ) def compute_loss(self, model, inputs, return_outputs=False): label_outputs = model(**inputs['predict']) cot_outputs = model(**inputs['rationale']) loss = self.alpha * cot_outputs.loss + (1. - self.alpha) * label_outputs.loss return (loss, {'rationale_loss': cot_outputs, 'predict_loss': label_outputs}) if return_outputs else loss class FedCoTTrainerClient(DSSTrainerClient): def __init__(self, ctx: Context, training_args: Seq2SeqTrainingArguments, train_set: PrefixDataset, val_set: Dataset = None, model: nn.Module = None, optimizer: torch.optim.Optimizer = None, data_collator: Callable = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, tokenizer: Optional[PreTrainedTokenizer] = None, callbacks: Optional[List[TrainerCallback]] = [], compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, alpha: float = 0.5, mode: Literal['train_only', 'infer_only', 'infer_and_train'] = 'infer_and_train', infer_client: Union[SLMEncoderDecoderClient, InferDPTClient] = None, encode_template: str = None, instruction_template: str = None, decode_template: str = None, result_key: str = 'infer_result', verbose: bool = False, remote_inference_kwargs: dict = {}, local_inference_kwargs: dict = {}, tmp_data_share_path: str = None ) -> None: self.mode = mode self.infer_client = infer_client self.infer_result = None self.infer_predict_kwargs = { 'encode_template': encode_template, 'instruction_template': instruction_template, 'decode_template': decode_template, 'result_key': result_key, 'verbose': verbose, 'remote_inference_kwargs': remote_inference_kwargs, 'local_inference_kwargs': local_inference_kwargs } self.infer_result = None self.tmp_data_share_path = tmp_data_share_path assert mode in _MODE, "mode should be one of {}".format(_MODE) if training_args.local_rank == 0: if mode == 'infer_only' or mode == 'infer_and_train': if self.infer_client is None: raise ValueError('You must provide an inference instance for remote inference') if mode != 'infer_only': training_args.remove_unused_columns = False # this parameter is neccessary DSSTrainerClient.__init__( self, model=model, training_args=training_args, train_set=train_set, val_set=val_set, data_collator=data_collator, optimizer=optimizer, scheduler=scheduler, tokenizer=tokenizer, preprocess_logits_for_metrics=preprocess_logits_for_metrics, compute_metrics=compute_metrics, callbacks=callbacks, alpha=alpha ) else: # skip trainer initialzation becuase training is not needed self.args = training_args self.train_dataset = train_set def infer(self) -> List[str]: if self.args.local_rank == 0: # other rank will skip federation step assert isinstance(self.train_dataset, PrefixDataset), "train_set should be an instance of PrefixDataset" dict_dataset = self.train_dataset.get_raw_dataset() infer_result = self.infer_client.inference(dict_dataset, **self.infer_predict_kwargs) self.infer_result = infer_result rationale_list = [i[self.infer_predict_kwargs['result_key']] for i in self.infer_result] self.train_dataset.load_rationale(rationale_list, 
key=self.infer_predict_kwargs['result_key']) logger.info('infer done') if self.mode == 'infer_and_train': if self.args.world_size > 1: # sync dataset with other ranks tmp_path = self.tmp_data_share_path if self.tmp_data_share_path is not None else self.args.output_dir logger.info('scattering obj, save to temp path {}'.format(tmp_path)) save_to(rationale_list, tmp_path) if self.args.local_rank > 0: if self.mode == 'infer_and_train': # wait until infer is done tmp_path = self.tmp_data_share_path if self.tmp_data_share_path is not None else self.args.output_dir logger.info('waiting for obj, load frm temp path {}'.format(tmp_path)) rationale_list = load(tmp_path) self.train_dataset.load_rationale(rationale_list) logger.info('Rationale loaded') def train(self): if self.mode == 'train_only': logger.info("Train only mode") super().train() elif self.mode == 'infer_only': logger.info("infer only mode, skip training") self.infer() elif self.mode == 'infer_and_train': logger.info("infer and train mode") self.infer() super().train() def get_infer_result(self): return self.infer_result class FedCoTTraineServer(object): def __init__(self, ctx: Context, infer_server: Union[SLMEncoderDecoderServer, InferDPTServer]): super().__init__() self.ctx = ctx self.infer_server = infer_server def train(self): logger.info('Server side start inference') self.infer_server.inference() logger.info('Server inference done') if __name__ == '__main__': pass ================================================ FILE: python/fate_llm/algo/fedcot/slm_encoder_decoder_trainer.py ================================================ from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainer from transformers import DataCollatorForSeq2Seq from transformers import AutoTokenizer import pandas as pd class EDPrefixDataCollator(DataCollatorForSeq2Seq): def __call__(self, features, return_tensors=None): features_df = pd.DataFrame(features) a = super().__call__(list(features_df['encoder']), return_tensors) b = super().__call__(list(features_df['decoder']), return_tensors) return { 'encoder': a, 'decoder': b } class EncoderDecoderPrefixTrainer(Seq2SeqTrainer): def __init__(self, alpha=0.5, *args, **kwargs): super().__init__(*args, **kwargs) self.alpha = alpha def compute_loss(self, model, inputs, return_outputs=False): out_a = model(**inputs['encoder']) out_b = model(**inputs['decoder']) loss = self.alpha * out_a.loss + (1. - self.alpha) * out_b.loss return (loss, {'out_a': out_a, 'out_b': out_b}) if return_outputs else loss ================================================ FILE: python/fate_llm/algo/fedkseed/__init__.py ================================================ ================================================ FILE: python/fate_llm/algo/fedkseed/args.py ================================================ from dataclasses import dataclass, field @dataclass class KSeedTrainingArguments: """ TrainingArguments is the subset of the arguments we use in our example scripts, they are the arguments that Parameters: optim: optional, default is KSeedZO The optimizer to use. eps: optional, default is 0.0005 Epsilon value for KSeedZerothOrderOptimizer. grad_clip: optional, default is -100.0 Gradient clip value for KSeedZerothOrderOptimizer. """ zo_optim: bool = field( default=True, metadata={"help": "Whether to use KSeedZerothOrderOptimizer. This suppress `optim` argument when True."}, ) k: int = field( default=4096, metadata={"help": "The number of seed candidates to use. 
This suppress `seed_candidates` argument when > 1."}, ) eps: float = field(default=0.0005, metadata={"help": "Epsilon value for KSeedZerothOrderOptimizer."}) grad_clip: float = field(default=-100.0, metadata={"help": "Gradient clip value for KSeedZerothOrderOptimizer."}) ================================================ FILE: python/fate_llm/algo/fedkseed/fedkseed.py ================================================ import copy import logging from dataclasses import dataclass, field from typing import List, Mapping import torch from fate.arch.context import Context from fate_llm.algo.fedkseed.pytorch_utils import get_optimizer_parameters_grouped_with_decay from fate_llm.algo.fedkseed.trainer import KSeedZOExtendedTrainer from fate_llm.algo.fedkseed.zo_utils import probability_from_amps, directional_derivative_step, get_even_seed_probabilities from fate_llm.algo.fedkseed.args import KSeedTrainingArguments logger = logging.getLogger(__name__) class Trainer: def __init__( self, ctx: Context, seed_candidates: torch.LongTensor, args, fedkseed_args, ): self.ctx = ctx self.args = args self.fedkseed_args = fedkseed_args self.seed_candidates = seed_candidates self.k = len(seed_candidates) self.model = None @staticmethod def get_clients(ctx: Context): clients = [ctx.guest] try: clients.extend(ctx.hosts) except: pass return clients def load_model(self): raise NotImplementedError def train(self): direction_derivative_history = {seed.item(): [self.fedkseed_args.grad_initial] for seed in self.seed_candidates} direction_derivative_sum = None seed_probabilities = None for aggregation_iter, sub_ctx in self.ctx.ctxs_range(self.fedkseed_args.num_aggregations): # step1: re-calculate sample probabilities for each seed if seed_probabilities is None: seed_probabilities = get_even_seed_probabilities(self.k) else: seed_probabilities = probability_from_amps( [direction_derivative_history[seed.item()] for seed in self.seed_candidates], self.fedkseed_args.bias_loss_clip, ) # step2(rpc): remote call to the clients to get the directional derivative history # proposal for client in self.get_clients(sub_ctx): client.put( "train_once", ( False, { "seed_candidates": self.seed_candidates, "seed_probabilities": seed_probabilities, "direction_derivative_sum": direction_derivative_sum, }, ), ) if direction_derivative_sum is None: direction_derivative_sum = {seed.item(): 0.0 for seed in self.seed_candidates} # wait for reply and update the directional derivative history for client in self.get_clients(sub_ctx): client_directional_derivative_history = client.get("direction_derivative_history") for seed, history in client_directional_derivative_history.items(): # torch.LongTensor -> int seed = int(seed) if seed not in direction_derivative_history: direction_derivative_history[seed] = [] direction_derivative_history[seed].extend(history) direction_derivative_sum[seed] += sum(history) # step3: evaluate to get stopping condition if necessary if self.should_stop(): break def should_stop(self): return False def evaluate(self): pass class ClientTrainer: def __init__(self, ctx: Context, model, fedkseed_args, training_args, train_dataset, eval_dataset, data_collator, tokenizer): self.ctx = ctx self.fedkseed_args = fedkseed_args self.training_args = training_args self.data_collator = data_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.tokenizer = tokenizer self.weight_decay = training_args.weight_decay self.model_0 = model def train(self): for i, sub_ctx in 
self.ctx.ctxs_range(self.fedkseed_args.num_aggregations): # step1: wait for the server to send the seed candidates and probabilities or exit signal logger.info(f"training loop started: {i}") should_exit, kwargs = sub_ctx.arbiter.get("train_once") seed_candidates = kwargs["seed_candidates"] seed_probabilities = kwargs["seed_probabilities"] direction_derivative_sum = kwargs["direction_derivative_sum"] logger.info( f"should_exit: {should_exit}, seed_candidates: {seed_candidates}, seed_probabilities: {seed_probabilities}" ) if should_exit: break # step2: start the training loop direction_derivative_history = self.train_once( seed_candidates, seed_probabilities, direction_derivative_sum ) # step3: send the directional derivative history to the server sub_ctx.arbiter.put("direction_derivative_history", direction_derivative_history) def train_once(self, seed_candidates, seed_probabilities, direction_derivative_sum) -> Mapping[int, List[float]]: # build model model = copy.deepcopy(self.model_0) model.to(self.training_args.device) if direction_derivative_sum is not None: param_groups = get_optimizer_parameters_grouped_with_decay(model, self.weight_decay) for seed, grad in direction_derivative_sum.items(): if grad != 0.0: directional_derivative_step( param_groups, seed, grad, lr=self.training_args.learning_rate, weight_decay=self.training_args.weight_decay ) # train trainer = KSeedZOExtendedTrainer( model=model, training_args=self.training_args, kseed_args=self.fedkseed_args, tokenizer=self.tokenizer, data_collator=self.data_collator, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, ) trainer.configure_seed_candidates(seed_candidates, seed_probabilities) trainer.train() if self.eval_dataset is not None: logger.info(f"evaluate: {trainer.evaluate()}") # get directional derivative history return trainer.get_directional_derivative_history() @dataclass class FedKSeedTrainingArguments(KSeedTrainingArguments): num_aggregations: int = field(default=10, metadata={"help": "The number of aggregations to perform."}) bias_loss_clip: float = field(default=1000.0, metadata={"help": "The bias loss clip value."}) grad_initial: float = field( default=0.0, metadata={"help": "The initial value for the directional derivative history."} ) ================================================ FILE: python/fate_llm/algo/fedkseed/optimizer.py ================================================ """ The implementations of ZerothOrderOptimizer and KSeedZerothOrderOptimizer is adapted from https://github.com/princeton-nlp/MeZO (MIT License) and https://github.com/alibaba/FederatedScope/tree/FedKSeed (Apache License 2.0) Copyright (c) 2021 Princeton Natural Language Processing Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --- # # Copyright 2023 The FederatedScope Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ import math from typing import Mapping, Optional, Callable, Tuple, List import torch from torch.optim import Optimizer from fate_llm.algo.fedkseed.pytorch_utils import get_optimizer_parameters_grouped_with_decay from fate_llm.algo.fedkseed.zo_utils import directional_derivative_step class RandomWalkOptimizer(Optimizer): """ Random Walk Optimizer This optimizer performs a `random` walk update for the parameters of the model. """ def __init__(self, params, lr, weight_decay, grad_clip, defaults=None): self.lr = lr self.weight_decay = weight_decay self.grad_clip = grad_clip if defaults is None: defaults = dict(lr=lr, weight_decay=weight_decay) else: defaults = dict(defaults) defaults.update(lr=lr, weight_decay=weight_decay) super(RandomWalkOptimizer, self).__init__(params, defaults) @classmethod def from_model(cls, model, lr, weight_decay, grad_clip, **kwargs): optimizer_grouped_parameters = get_optimizer_parameters_grouped_with_decay(model, weight_decay) kwargs["lr"] = lr kwargs["weight_decay"] = weight_decay kwargs["grad_clip"] = grad_clip return cls(optimizer_grouped_parameters, **kwargs) def directional_derivative_step( self, directional_derivative_seed: int, directional_derivative_value: torch.FloatTensor ) -> torch.FloatTensor: """ perform a step update for the parameters of the model along the random direction z with the learning rate lr and the step size grad_projected_value """ if self.grad_clip > 0.0: if abs(directional_derivative_value) > self.grad_clip: return torch.FloatTensor([torch.nan]) directional_derivative_step(self.param_groups, directional_derivative_seed, directional_derivative_value) return directional_derivative_value def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: raise NotImplementedError( "use random_step instead of step for RandomWalkOptimizer \ since we need pass the `seed` and `grad_projected_value`" ) class ZerothOrderOptimizer(RandomWalkOptimizer): def __init__(self, params, lr, eps, weight_decay, grad_clip): self.eps = eps defaults = dict(eps=eps) super(ZerothOrderOptimizer, self).__init__(params, lr, weight_decay, grad_clip, defaults) def zeroth_order_step( self, directional_derivative_seed: int, closure: Callable[[], torch.FloatTensor] ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """ perform a step update for the parameters of the model along the random direction z generated by the `directional_derivative_seed` with the learning rate lr and the step size of calculated namely `directional_derivative_value` Input: - directional_derivative_seed: the seed for generating the random direction z - closure (callable, optional): A closure that 
reevaluates the model and returns the loss. Output: - directional_derivative_value: the gradient projected value - loss_right: the loss of the model with the perturbed parameters x + eps * z - loss_left: the loss of the model with the perturbed parameters x - eps * z """ # x -> x + eps * z self.random_perturb_parameters(directional_derivative_seed, scaling_factor=1.0) loss_right = closure() # x + eps * z -> x - eps * z self.random_perturb_parameters(directional_derivative_seed, scaling_factor=-2.0) loss_left = closure() # x - eps * z -> x self.random_perturb_parameters(directional_derivative_seed, scaling_factor=1.0) if torch.isnan(loss_right): return loss_right, loss_right, loss_left if torch.isnan(loss_left): return loss_left, loss_right, loss_left # ∇f(x) · z = D_z f(x) ≈ (f(x + eps * z) - f(x - eps * z)) / (2 * eps) directional_derivative_value = (loss_right - loss_left) / (2 * self.eps) # perform update for the random direction z * grad_projected_value directional_derivative_value = self.directional_derivative_step( directional_derivative_seed, directional_derivative_value ) return directional_derivative_value, loss_right, loss_left def random_perturb_parameters(self, directional_derivative_seed: int, scaling_factor: float): """ Perturb the parameters with random direction z generated by the directional_derivative_seed for each parameter theta, the update is theta = theta + scaling_factor * z * eps Input: - seed: the seed for generating the random direction z - scaling_factor: the scaling factor for the random direction z Output: - None """ torch.manual_seed(directional_derivative_seed) for param_group in self.param_groups: eps = param_group["eps"] for param in param_group["params"]: if param.requires_grad: z = torch.normal( mean=0, std=1, size=param.data.size(), device=param.data.device, dtype=param.data.dtype ) param.data = param.data + scaling_factor * eps * z class KSeedZerothOrderOptimizer(ZerothOrderOptimizer): def __init__( self, params, seed_candidates: torch.LongTensor, seed_probabilities: torch.FloatTensor, lr, eps, weight_decay, grad_clip, ): self.seed_candidate = seed_candidates self.seed_probabilities = seed_probabilities self.directional_derivative_history: Mapping[int, List[float]] = {seed.item(): [] for seed in seed_candidates} self.sample_random_generator = torch.Generator() super(KSeedZerothOrderOptimizer, self).__init__(params, lr, eps, weight_decay, grad_clip) def sample(self) -> int: sampled = torch.multinomial( input=self.seed_probabilities, num_samples=1, generator=self.sample_random_generator, )[0].item() return self.seed_candidate[sampled].item() def step(self, closure: Callable[[], torch.FloatTensor] = None) -> torch.FloatTensor: if closure is None: # closure is required for the zeroth_order_step, but we # don't raise an error here to maintain compatibility with # the third-party tools that use the `step` method without # providing the closure in training loop, e.g., HuggingFace Transformers return torch.FloatTensor([torch.nan]) return self.kseed_zeroth_order_step(closure) def kseed_zeroth_order_step(self, closure: Callable[[], torch.FloatTensor]) -> torch.FloatTensor: """ Performs a single optimization step. 1. Sample a random seed for sampling z 2. Perturb the parameters with the random direction(-z * eps, z * eps) for evaluating the model on the batch, and compute the loss(loss1, loss2) 3. Compute the directional derivative value: grad_projected_value = (loss_right - loss_left) / (2 * eps) 4. 
Perform the directional derivative step update for the parameters of the model along the random direction z with the learning rate lr and the step size grad_projected_value Input: - closure (callable, optional): A closure that reevaluates the model and returns the loss. """ if closure is None: raise ValueError("closure must not be None") # sample the random seed for sampling z for perturbing parameters. seed = self.sample() directional_derivative_value, loss_right, loss_left = self.zeroth_order_step(seed, closure) if math.isnan(directional_derivative_value): return directional_derivative_value # record the directional_derivative_value for the seed self.directional_derivative_history[seed].append(directional_derivative_value.item()) return loss_right # TODO: return loss_left or loss_right or average of both? ================================================ FILE: python/fate_llm/algo/fedkseed/pytorch_utils.py ================================================ from typing import List from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS from transformers.trainer_pt_utils import get_parameter_names def get_decay_parameter_names(model) -> List[str]: """ Get all parameter names that weight decay will be applied to Note that some models implement their own layernorm instead of calling nn.LayerNorm, weight decay could still apply to those modules since this function only filter out instance of nn.LayerNorm NOTE: This function is copied from transformers # Copyright 2020-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
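    Returns:
        A list of parameter names that should receive weight decay, i.e. all
        parameters except those inside LayerNorm-type modules and any parameter
        whose name contains "bias".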
""" decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS) decay_parameters = [name for name in decay_parameters if "bias" not in name] return decay_parameters def get_optimizer_parameters_grouped_with_decay(model, weight_decay: float) -> List[dict]: """ Get the parameters grouped by whether they should have weight decay applied """ decay_parameters = get_decay_parameter_names(model) params_no_decay = [] params_decay = [] for n, p in model.named_parameters(): if p.requires_grad: if n in decay_parameters: params_decay.append(p) else: params_no_decay.append(p) grouped_parameters_with_decay = [ {"params": params_no_decay, "weight_decay": 0.0}, {"params": params_decay, "weight_decay": weight_decay}, ] return grouped_parameters_with_decay ================================================ FILE: python/fate_llm/algo/fedkseed/trainer.py ================================================ import logging from typing import Dict, Union, Any, Tuple from typing import Optional, List, Callable import torch from torch import nn from torch.utils.data import Dataset from transformers import PreTrainedModel, PreTrainedTokenizerBase, EvalPrediction, DataCollator from transformers import Trainer, TrainingArguments from transformers.optimization import get_scheduler, SchedulerType from transformers.trainer_callback import TrainerCallback from fate_llm.algo.fedkseed.args import KSeedTrainingArguments from fate_llm.algo.fedkseed.optimizer import KSeedZerothOrderOptimizer from fate_llm.algo.fedkseed.pytorch_utils import get_optimizer_parameters_grouped_with_decay logger = logging.getLogger(__name__) class KSeedZOExtendedTrainer(Trainer): def __init__( self, model: Union[PreTrainedModel, nn.Module] = None, training_args: TrainingArguments = None, kseed_args: "KSeedTrainingArguments" = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): super().__init__( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, model_init=model_init, compute_metrics=compute_metrics, callbacks=callbacks, optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) self.kseed_args = kseed_args self._kseed_optimizer = None self._seed_candidates = None self._seed_probabilities = None def configure_seed_candidates(self, seed_candidates: torch.LongTensor, seed_probabilities: torch.FloatTensor): self._seed_candidates = seed_candidates self._seed_probabilities = seed_probabilities def get_directional_derivative_history(self): """ hook to get the directional derivative history """ if KSeedZOExtendedTrainer.k_seed_zo_mode(self.kseed_args): if self._kseed_optimizer is None: raise ValueError("KSeedZerothOrderOptimizer is not configured") return self._kseed_optimizer.directional_derivative_history else: raise ValueError("KSeedZerothOrderOptimizer is not configured") @staticmethod def k_seed_zo_mode(args): return hasattr(args, "zo_optim") and args.zo_optim def training_step(self, model: nn.Module, inputs: 
Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: """ hook to do the step with KSeedZerothOrderOptimizer """ if KSeedZOExtendedTrainer.k_seed_zo_mode(self.kseed_args): if self._kseed_optimizer is None: raise ValueError("KSeedZerothOrderOptimizer is not configured") model.eval() inputs = self._prepare_inputs(inputs) with self.compute_loss_context_manager(): # zeroth order optimization needs forward pass twice in an optimization step, # so we need to wrap the forward pass in a closure def closure() -> torch.FloatTensor: with torch.no_grad(): return self.compute_loss(model, inputs, return_outputs=False).detach() # we don't use step() method of KSeedZerothOrderOptimizer here # because `Trainer` wraps the optimizer that is subclass of `torch.optim.Optimizer` and # returns nothing from the step method with torch.no_grad(): loss = self._kseed_optimizer.kseed_zeroth_order_step(closure=closure) return loss.detach() else: return super().training_step(model, inputs) def create_optimizer_and_scheduler(self, num_training_steps: int): """ hook to add KSeedZerothOrderOptimizer """ if KSeedZOExtendedTrainer.k_seed_zo_mode(self.kseed_args): if self._seed_candidates is None or self._seed_probabilities is None: raise ValueError("Seed candidates and probabilities are not configured.") optimizer_grouped_parameters = get_optimizer_parameters_grouped_with_decay( self.model, self.args.weight_decay ) self.optimizer = KSeedZerothOrderOptimizer( optimizer_grouped_parameters, seed_candidates=self._seed_candidates, seed_probabilities=self._seed_probabilities, lr=self.args.learning_rate, eps=self.kseed_args.eps, weight_decay=self.args.weight_decay, grad_clip=self.kseed_args.grad_clip, ) # we need to keep the reference to the original optimizer to use it in training_step self._kseed_optimizer = self.optimizer # if we use learning rate scheduler, we may need to preserve all updates instead of the aggregated one self.lr_scheduler = get_scheduler( name=SchedulerType.CONSTANT, optimizer=self.optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps, ) else: super().create_optimizer_and_scheduler(num_training_steps) ================================================ FILE: python/fate_llm/algo/fedkseed/zo_utils.py ================================================ from typing import List import torch def probability_from_amps(amps: List[List[float]], clip): """ Get the probability distribution from the amplitude history formula: amp_i = clamp(amp_i, -clip, clip).abs().mean() amp_i = (amp_i - min(amp)) / (max(amp) - min(amp)) prob_i = softmax(amp)_i :param amps: list of amplitude history :param clip: the clipping value :return: """ amps = [torch.Tensor(amp) for amp in amps] amp = torch.stack([amp.clamp_(-clip, clip).abs_().mean() for amp in amps]) return (amp - amp.min()).div_(amp.max() - amp.min() + 1e-10).softmax(0) def directional_derivative_step( param_groups: List[dict], directional_derivative_seed: int, directional_derivative_value: torch.FloatTensor, lr: float = None, weight_decay: float = None, ) -> torch.FloatTensor: """ perform a step update for the parameters of the model along the random direction z with the learning rate lr and the step size grad_projected_value Input: - param_groups (List[dict]): list of parameter groups - directional_derivative_seed (int): seed for the random direction - directional_derivative_value (torch.FloatTensor): the step size - lr (float, optional): learning rate - weight_decay (float, optional): weight decay """ 
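    # Re-seeding the RNG with the shared seed regenerates the exact same Gaussian
    # direction z on every party, so only (seed, directional-derivative) pairs need
    # to be communicated instead of full gradients. Each parameter is then updated
    # SGD-style: param <- param - lr * (directional_derivative_value * z
    # [+ weight_decay * param]).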
torch.manual_seed(directional_derivative_seed) for param_group in param_groups: weight_decay = param_group["weight_decay"] if weight_decay is None else weight_decay lr = param_group["lr"] if lr is None else lr for param in param_group["params"]: z = torch.normal(mean=0, std=1, size=param.data.size(), device=param.data.device, dtype=param.data.dtype) if weight_decay is not None: param.data = param.data - lr * (directional_derivative_value * z + weight_decay * param.data) else: param.data = param.data - lr * (directional_derivative_value * z) return directional_derivative_value def build_seed_candidates(k, low=0, high=2**32): """ Build seed candidates for the random walk optimizer """ return torch.randint(low, high, size=(k,), dtype=torch.long) def get_even_seed_probabilities(k): """ Get the even seed probabilities, i.e., 1/k for each seed """ return torch.ones(k) / k ================================================ FILE: python/fate_llm/algo/fedmkt/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate_llm.algo.fedmkt.fedmkt import ( FedMKTTrainingArguments, FedMKTSLM, FedMKTLLM ) __all__ = [ "FedMKTSLM", "FedMKTLLM", "FedMKTTrainingArguments" ] ================================================ FILE: python/fate_llm/algo/fedmkt/fedmkt.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
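#
# NOTE (rough sketch of the per-global-epoch flow implemented below, see
# FedMKTSLM.train and FedMKTLLM.train): each client first fine-tunes its SLM on
# private data; both sides then score the shared public set and keep the top-k
# logits per position (top_k_logits_keep, default 128); the logits are exchanged
# via the guest/host/arbiter channels, token-aligned across tokenizers
# (token_align), and finally distilled into the local model with LM loss weight
# kd_alpha and temperature distill_temperature.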
# import torch import logging import datasets from dataclasses import dataclass, field import transformers from ...trainer.seq2seq_trainer import Seq2SeqTrainingArguments from typing import Dict, Optional, List, Callable, Union from fate.arch import Context from fate.ml.nn.trainer.trainer_base import FedArguments from torch.utils.data import Dataset from transformers.trainer_callback import TrainerCallback from transformers import PreTrainedTokenizer from transformers import Seq2SeqTrainer from transformers.trainer_utils import EvalPrediction from transformers.modeling_utils import PreTrainedModel from transformers.modeling_utils import unwrap_model from fate_llm.algo.fedmkt.token_alignment.token_align import token_align from fate_llm.algo.fedmkt.utils.generate_logit_utils import generate_pub_data_logits from fate.ml.aggregator import AggregatorClientWrapper, AggregatorServerWrapper from fate_llm.algo.fedmkt.fedmkt_trainer import FedMKTTrainer from fate_llm.algo.fedmkt.fedmkt_data_collator import DataCollatorForFedMKT from fate_llm.algo.fedmkt.utils.dataset_sync_util import sync_dataset logger = logging.getLogger(__name__) @dataclass class FedMKTTrainingArguments(Seq2SeqTrainingArguments): """ selection metric type """ metric_type: str = field(default="ce") """ top-k logits select params """ top_k_logits_keep: int = field(default=128) top_k_strategy: str = field(default="highest") """ distillation params """ distill_loss_type: str = field(default="ce") kd_alpha: float = field(default=0.9) distill_temperature: float = field(default=1.0) server_public_data_local_epoch: int = field(default=1) client_public_data_local_epoch: int = field(default=1) client_priv_data_local_epoch: int = field(default=1) distill_strategy: str = field(default="greater") global_epochs: int = field(default=1) """ token-alignment params """ skip_align: bool = field(default=False) token_align_strategy: str = field(default="dtw") vocab_mapping_paths: Union[str, List[str]] = field(default=None) vocab_size: int = field(default=None) """ homo training params """ post_fedavg: bool = field(default=False) """ slm training only """ llm_training: bool = field(default=True) def to_dict(self): from dataclasses import fields from enum import Enum d = {field.name: getattr(self, field.name) for field in fields(self) if field.init} for k, v in d.items(): if isinstance(v, Enum): d[k] = v.value if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum): d[k] = [x.value for x in v] if k.endswith("_token"): d[k] = f"<{k.upper()}>" return d def to_dict_without_extra_args(self): args_dict = self.to_dict() args_dict.pop("metric_type") args_dict.pop("top_k_logits_keep") args_dict.pop("top_k_strategy") args_dict.pop("distill_loss_type") args_dict.pop("kd_alpha") args_dict.pop("distill_temperature") args_dict.pop("distill_strategy") args_dict.pop("server_public_data_local_epoch") args_dict.pop("client_public_data_local_epoch") args_dict.pop("client_priv_data_local_epoch") args_dict.pop("global_epochs") args_dict.pop("skip_align", False) args_dict.pop("token_align_strategy") args_dict.pop("vocab_mapping_paths", None) args_dict.pop("vocab_size", None) args_dict.pop("post_fedavg") args_dict.pop("llm_training", True) return args_dict def to_dict_with_client_priv_training_args(self): args_dict = self.to_dict_without_extra_args() args_dict["num_train_epochs"] = self.client_priv_data_local_epoch return args_dict def to_dict_with_client_kd_args(self): args_dict = self.to_dict_without_extra_args() args_dict["num_train_epochs"] = 
self.client_public_data_local_epoch return args_dict def to_dict_with_server_kd_args(self): args_dict = self.to_dict_without_extra_args() args_dict["num_train_epochs"] = self.server_public_data_local_epoch return args_dict class FedMKTBase(object): def __init__(self, *args, **kwargs): self.model = None self.save_trainable_weights_only = None def save_model( self, output_dir: Optional[str] = None, state_dict=None ): if not self.save_trainable_weights_only: torch.save(self.model.state_dict(), output_dir + '/pytorch_model.bin') else: model = unwrap_model(self.model) if hasattr(model, "save_trainable"): model.save_trainable(output_dir) else: state_dict = { k: p.to("cpu") for k, p in model.named_parameters() if p.requires_grad } torch.save(state_dict, output_dir + '/pytorch_model.bin') class FedMKTSLM(FedMKTBase): def __init__( self, ctx: Context, model: torch.nn.Module, training_args: FedMKTTrainingArguments, fed_args: FedArguments = None, priv_train_set=None, pub_train_set=None, val_set: Dataset = None, priv_optimizer: torch.optim.Optimizer = None, pub_optimizer: torch.optim.Optimizer = None, priv_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, pub_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, data_collator: Callable = None, tokenizer: Optional[PreTrainedTokenizer] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = [], save_trainable_weights_only: bool = False, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, llm_tokenizer=None, llm_to_slm_vocab_mapping=None, ): super(FedMKTSLM, self).__init__() self.ctx = ctx self.training_args = training_args self.fed_args = fed_args self.model = model self.tokenizer = tokenizer self.model_init = model_init self.callbacks = callbacks self.compute_metrics = compute_metrics self.save_trainable_weights_only = save_trainable_weights_only self.preprocess_logits_for_metrics = preprocess_logits_for_metrics self.priv_data_collator = data_collator self.priv_optimizer = priv_optimizer self.pub_optimizer = pub_optimizer self.priv_scheduler = priv_scheduler self.pub_scheduler = pub_scheduler self.priv_train_set = priv_train_set self.pub_train_set = pub_train_set self.llm_tokenizer = llm_tokenizer self.llm_to_slm_vocab_mapping = llm_to_slm_vocab_mapping self.val_set = val_set self.aggregator = self._init_aggregator(ctx, fed_args) if not isinstance(self.pub_train_set, datasets.Dataset): self.pub_train_set = datasets.Dataset.from_list(list(self.pub_train_set)) def train(self): global_epochs = self.training_args.global_epochs llm_pub_logits = None for i, iter_ctx in self.ctx.on_iterations.ctxs_range(global_epochs): logger.info(f"begin {i}-th global kd process") priv_data_training_args = self._get_priv_data_training_args() priv_trainer = Seq2SeqTrainer( model=self.model, tokenizer=self.tokenizer, data_collator=self.priv_data_collator, train_dataset=self.priv_train_set, args=priv_data_training_args, model_init=self.model_init if not i else None, compute_metrics=self.compute_metrics, callbacks=self.callbacks, optimizers=(self.priv_optimizer, self.priv_scheduler), preprocess_logits_for_metrics=self.preprocess_logits_for_metrics ) logger.info(f"begin {i}-th private data training process") priv_trainer.train() self.model = unwrap_model(priv_trainer.model) logger.info(f"begin {i}-th public logits generation process") if self.training_args.world_size <= 1 or 
self.training_args.local_rank == 0: slm_pub_logits = self.pub_train_set.map( generate_pub_data_logits, batched=True, batch_size=self.training_args.per_device_train_batch_size, num_proc=None, load_from_cache_file=True, fn_kwargs={"model": self.model, "training_args": self.training_args, "data_collator": transformers.DataCollatorForSeq2Seq(self.tokenizer)} ) if self.training_args.world_size > 1: logger.info("sync slm_pub_logits") sync_dataset( slm_pub_logits, self.training_args.local_rank, self.training_args.world_size, self.training_args.device ) if self.training_args.llm_training: logger.debug(f"send {i}-th public logits to llm") iter_ctx.arbiter.put("slm_pub_logits", slm_pub_logits.to_dict()) if self.training_args.llm_training or not i: llm_pub_logits = datasets.Dataset.from_dict(iter_ctx.arbiter.get("llm_pub_logits")) if self.training_args.world_size > 1: logger.info("sync llm_pub_logits") sync_dataset(llm_pub_logits, self.training_args.local_rank, self.training_args.world_size, self.training_args.device) else: slm_pub_logits = sync_dataset( None, self.training_args.local_rank, self.training_args.world_size, self.training_args.device ) if self.training_args.llm_training or not i: llm_pub_logits = sync_dataset(None, self.training_args.local_rank, self.training_args.world_size, self.training_args.device) logger.info(f"begin {i}-th token alignment process") aligned_dataset = token_align( base_model_logits_datasets=slm_pub_logits, blending_model_logits_dataset=llm_pub_logits, base_tokenizer=self.tokenizer, blending_tokenizer=self.llm_tokenizer, blending_to_base_mapping=self.llm_to_slm_vocab_mapping, blending_model_index=0, skip_align=self.training_args.skip_align, align_strategy=self.training_args.token_align_strategy ) logger.info(f"begin {i}-th public logits kd process") fedmkt_trainer = self._init_trainer_for_distill(aligned_dataset) fedmkt_trainer.train() self.model = unwrap_model(fedmkt_trainer.model) if self.training_args.post_fedavg and (i + 1) % self.fed_args.aggregate_freq == 0: self.aggregator.model_aggregation(iter_ctx, self.model) def _init_trainer_for_distill(self, train_set): public_data_training_args = self._get_pub_data_kd_training_args() fedmkt_trainer = FedMKTTrainer( model=self.model, tokenizer=self.tokenizer, args=public_data_training_args, train_dataset=train_set, eval_dataset=self.val_set, data_collator=DataCollatorForFedMKT( self.tokenizer, padding="max_length", max_length=max(len(d["input_ids"]) for d in train_set), blending_num=1, vocab_size=self.training_args.vocab_size, dtype=next(self.model.parameters()).dtype, distill_temperature=self.training_args.distill_temperature ), blending_num=1, lm_loss_weight=self.training_args.kd_alpha, distill_loss_type=self.training_args.distill_loss_type, distill_strategy=self.training_args.distill_strategy ) return fedmkt_trainer def _get_priv_data_training_args(self): pre_args = self.training_args.to_dict_with_client_priv_training_args() post_args = Seq2SeqTrainingArguments(**pre_args) return post_args def _get_pub_data_kd_training_args(self): pre_args = self.training_args.to_dict_with_client_kd_args() post_args = Seq2SeqTrainingArguments(**pre_args) return post_args def _init_aggregator(self, ctx: Context, fed_args: FedArguments): if not self.training_args.post_fedavg: return None aggregate_type = "weighted_mean" aggregator_name = "fedavg" aggregator = fed_args.aggregator return AggregatorClientWrapper( ctx, aggregate_type, aggregator_name, aggregator, sample_num=len(self.pub_train_set), args=self.training_args ) class 
FedMKTLLM(FedMKTBase): def __init__( self, ctx: Context, model: torch.nn.Module, training_args: FedMKTTrainingArguments, fed_args: FedArguments = None, train_set=None, val_set: Dataset = None, optimizer: torch.optim.Optimizer = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, data_collator: Callable = None, tokenizer: Optional[PreTrainedTokenizer] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = [], save_trainable_weights_only: bool = False, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, slm_tokenizers: List = None, slm_to_llm_vocab_mappings: List[Dict] = None, ): super(FedMKTLLM, self).__init__() self.ctx = ctx self.model = model self.training_args = training_args self.fed_args = fed_args self.train_set = train_set self.val_set = val_set self.optimizer = optimizer self.lr_scheduler = scheduler self.data_collator = data_collator self.tokenizer = tokenizer self.model_init = model_init self.compute_metrics = compute_metrics self.callbacks = callbacks self.save_trainable_weights_only = save_trainable_weights_only self.preprocess_logits_for_metrics = preprocess_logits_for_metrics self.slm_tokenizers = slm_tokenizers self.slm_to_llm_vocab_mappings = slm_to_llm_vocab_mappings self.aggregator = self._init_aggregator(ctx) if not isinstance(self.train_set, datasets.Dataset): self.train_set = datasets.Dataset.from_list(list(self.train_set)) def _init_aggregator(self, ctx: Context): if not self.training_args.post_fedavg: return None return AggregatorServerWrapper(ctx) def generate_pub_data_logits(self, first_epoch=False): fn_kwargs = {"model": self.model, "training_args": self.training_args, "data_collator": transformers.DataCollatorForSeq2Seq(self.tokenizer)} if first_epoch and self.training_args.device.type == "cuda": self.model.cuda(self.training_args.device) return self.train_set.map( generate_pub_data_logits, batched=True, batch_size=self.training_args.per_device_train_batch_size, num_proc=None, load_from_cache_file=True, fn_kwargs=fn_kwargs ) def on_epoch_begin(self, iter_ctx, epoch_idx, previous_pub_dataset): logger.info(f"on {epoch_idx}-epoch begin") if not self.training_args.llm_training: return if previous_pub_dataset is None: if self.training_args.world_size <= 1 or self.training_args.local_rank == 0: llm_pub_logits = self.generate_pub_data_logits(first_epoch=True if not epoch_idx else False) if self.training_args.world_size > 1: sync_dataset(llm_pub_logits, self.training_args.local_rank, self.training_args.world_size, self.training_args.device) else: llm_pub_logits = sync_dataset(None, self.training_args.local_rank, self.training_args.world_size, self.training_args.device) else: llm_pub_logits = previous_pub_dataset slm_pub_logits_list = list() if self.training_args.world_size <= 1 or self.training_args.local_rank == 0: slm_pub_logits_list.append(datasets.Dataset.from_dict(iter_ctx.guest.get('slm_pub_logits'))) if any(p.role == 'host' for p in self.ctx.parties): slm_pub_logits_list.extend( datasets.Dataset.from_dict(client_logits) for client_logits in iter_ctx.hosts.get("slm_pub_logits") ) if self.training_args.world_size > 1: logger.info("sync dataset to other rank") for slm_pub_logits in slm_pub_logits_list: sync_dataset(slm_pub_logits, self.training_args.local_rank, self.training_args.world_size, self.training_args.device) logger.info("end to sync") else: logger.info("sync 
dataset from rank 0") for _ in range(len(self.slm_tokenizers)): slm_pub_logits_list.append( sync_dataset(None, self.training_args.local_rank, self.training_args.world_size, self.training_args.device) ) logger.info("end to sync dataset from rank 0") aligned_dataset = llm_pub_logits for idx, slm_pub_logits in enumerate(slm_pub_logits_list): aligned_dataset = token_align( base_model_logits_datasets=aligned_dataset, blending_model_logits_dataset=slm_pub_logits, base_tokenizer=self.tokenizer, blending_tokenizer=self.slm_tokenizers[idx], blending_to_base_mapping=self.slm_to_llm_vocab_mappings[idx], blending_model_index=idx, skip_align=self.training_args.skip_align, align_strategy=self.training_args.token_align_strategy ) return aligned_dataset def on_epoch_end(self, iter_ctx, epoch_idx): logger.info(f"on {epoch_idx}-epoch end") if not self.training_args.llm_training and epoch_idx > 1: return llm_pub_logits = self.generate_pub_data_logits(first_epoch=True if not self.training_args.llm_training else False) if self.training_args.world_size <= 1 or self.training_args.local_rank == 0: iter_ctx.guest.put("llm_pub_logits", llm_pub_logits.to_dict()) if len(self.slm_tokenizers) > 1: iter_ctx.hosts.put("llm_pub_logits", llm_pub_logits.to_dict()) if self.training_args.post_fedavg and (epoch_idx + 1) % self.fed_args.aggregate_freq == 0: self.aggregator.model_aggregation(iter_ctx) if self.training_args.world_size > 1: sync_dataset( llm_pub_logits, self.training_args.local_rank, self.training_args.world_size, self.training_args.device ) else: llm_pub_logits = sync_dataset( None, self.training_args.local_rank, self.training_args.world_size, self.training_args.device ) return llm_pub_logits def _get_pub_data_kd_training_args(self): pre_args = self.training_args.to_dict_with_server_kd_args() post_args = Seq2SeqTrainingArguments(**pre_args) return post_args def train(self): global_epochs = self.training_args.global_epochs previous_pub_logits = None for i, iter_ctx in self.ctx.on_iterations.ctxs_range(global_epochs): logger.info(f"begin {i}-th global kd process") aligend_train_set = self.on_epoch_begin(iter_ctx, i, previous_pub_logits) if self.training_args.llm_training: public_data_training_args = self._get_pub_data_kd_training_args() fedmkt_trainer = FedMKTTrainer( model=self.model, tokenizer=self.tokenizer, args=public_data_training_args, train_dataset=aligend_train_set, eval_dataset=self.val_set, data_collator=DataCollatorForFedMKT( self.tokenizer, padding="max_length", max_length=max(len(d["input_ids"]) for d in aligend_train_set), blending_num=len(self.slm_tokenizers), vocab_size=self.training_args.vocab_size, dtype=next(self.model.parameters()).dtype, distill_temperature=self.training_args.distill_temperature ), blending_num=len(self.slm_tokenizers), lm_loss_weight=self.training_args.kd_alpha, distill_loss_type=self.training_args.distill_loss_type, distill_strategy=self.training_args.distill_strategy ) fedmkt_trainer.train() self.model = unwrap_model(fedmkt_trainer.model) previous_pub_logits = self.on_epoch_end(iter_ctx, i) ================================================ FILE: python/fate_llm/algo/fedmkt/fedmkt_data_collator.py ================================================ # # NOTE: The implementations of DataCollatorForFedMKT is modified from FuseAI/FuseLLM # Copyright FuseAI/FuseLLM # # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import torch from torch.nn.functional import softmax from transformers import DataCollatorForSeq2Seq from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.utils import PaddingStrategy from typing import Optional, Any, Union import logging from fate_llm.algo.fedmkt.utils.vars_define import ( ALIGNED_OTHER_LOGITS, ALIGNED_OTHER_INDICES, PER_STEP_LOGITS, PER_STEP_INDICES, SELF_TARGET_DIST, OTHER_TARGET_DIST ) logger = logging.getLogger(__name__) class DataCollatorForFedMKT(DataCollatorForSeq2Seq): """modified from https://github.com/fanqiwan/FuseAI/blob/main/FuseLLM/src/utils/data_collator.py#L135""" tokenizer: PreTrainedTokenizerBase model: Optional[Any] = None padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None label_pad_token_id: int = -100 return_tensors: str = "pt" blending_num: int = 1 distill_temperature: float = 1.0 vocab_size: int = None dtype: torch.dtype = torch.bfloat16 def __init__(self, *args, **kwargs): blending_num = kwargs.pop("blending_num", 4) vocab_size = kwargs.pop("vocab_size", None) dtype = kwargs.pop("dtype", torch.dtype) distill_temperature = kwargs.pop("distill_temperature", 1.0) super(DataCollatorForFedMKT, self).__init__(*args, **kwargs) self.blending_num = blending_num self.vocab_size = vocab_size if vocab_size is not None else len(self.tokenizer.get_vocab()) self.pad_id = self.tokenizer.pad_token_id self.dtype = dtype self.distill_temperature = distill_temperature def __call__(self, features, return_tensors=None): extra_features = dict() feature_keys = list(features[0].keys()) for f_key in feature_keys: if f_key not in ["input_ids", "attention_mask", "labels"]: extra_features[f_key] = [] for feature in features: extra_features[f_key].append(feature.pop(f_key)) features = super().__call__(features=features, return_tensors=return_tensors) features.update(extra_features) batch_size = features["input_ids"].size(0) base_target_dist = torch.zeros(batch_size, self.max_length, self.vocab_size).to(self.dtype) aligned_target_dists = [torch.zeros(batch_size, self.max_length, self.vocab_size).to(self.dtype) for _ in range(self.blending_num)] for i in range(batch_size): base_seq_len = len(features[PER_STEP_LOGITS][i]) for j in range(self.max_length): if j < base_seq_len: base_logits = torch.tensor(features[PER_STEP_LOGITS][i][j], dtype=self.dtype) base_prob = softmax(base_logits / self.distill_temperature, -1) base_indices = torch.tensor(features[PER_STEP_INDICES][i][j]) base_target_dist[i][j] = base_target_dist[i][j].scatter_(-1, base_indices, base_prob) for k in range(self.blending_num): per_step_aligned_indices_key = f"{ALIGNED_OTHER_INDICES}_{k}" per_step_aligned_logits_key = f"{ALIGNED_OTHER_LOGITS}_{k}" if len(features[per_step_aligned_indices_key][i][j]) > 0: aligned_logits = torch.tensor(features[per_step_aligned_logits_key][i][j], dtype=self.dtype) aligned_prob = softmax(aligned_logits / self.distill_temperature, -1) aligned_indices = torch.tensor(features[per_step_aligned_indices_key][i][j]) aligned_target_dists[k][i][j] = 
aligned_target_dists[k][i][j].scatter_(-1, aligned_indices, aligned_prob) else: aligned_target_dists[k][i][j] = base_target_dist[i][j] else: # padding position base_target_dist[i][j][self.pad_id] = 1.0 for k in range(self.blending_num): aligned_target_dists[k][i][j][self.pad_id] = 1.0 features.pop(PER_STEP_LOGITS) features.pop(PER_STEP_INDICES) for i in range(self.blending_num): features.pop(f"{ALIGNED_OTHER_LOGITS}_{i}") features.pop(f"{ALIGNED_OTHER_INDICES}_{i}") features[f"{OTHER_TARGET_DIST}_{i}"] = aligned_target_dists[i] features[SELF_TARGET_DIST] = base_target_dist return features ================================================ FILE: python/fate_llm/algo/fedmkt/fedmkt_trainer.py ================================================ # # NOTE: The implementations of FedMKTTrainer is modified from FuseAI/FuseLLM # Copyright FuseAI # # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import logging import torch from torch.nn.functional import kl_div, log_softmax, cross_entropy from transformers import Seq2SeqTrainer from transformers.modeling_utils import unwrap_model from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from fate_llm.algo.fedmkt.utils.vars_define import ( SELF_TARGET_DIST, OTHER_TARGET_DIST, ALIGNED_OTHER_METRIC, METRIC, ) logger = logging.getLogger(__name__) class FedMKTTrainer(Seq2SeqTrainer): """ modified from https://github.com/fanqiwan/FuseAI/blob/main/FuseLLM/src/utils/trainer.py#L22 """ blending_num: int = 2 distill_loss_type: str = "ce" lm_loss_weight: float = 0.9 distill_strategy = "greater" def __init__(self, *args, **kwargs): blending_num = kwargs.pop("blending_num", 1) distill_loss_type = kwargs.pop("distill_loss_type", "ce") lm_loss_weight = kwargs.pop("lm_loss_weight", 0.9) distill_strategy = kwargs.pop("distill_strategy", "greater") super(FedMKTTrainer, self).__init__(*args, **kwargs) self.blending_num = blending_num self.distill_loss_type = distill_loss_type self.lm_loss_weight = lm_loss_weight self.distill_strategy = distill_strategy def compute_loss(self, model, inputs, return_outputs=False): if self.label_smoother is not None and "labels" in inputs: labels = inputs.pop("labels") else: labels = None base_target_dist = inputs.pop(SELF_TARGET_DIST) base_metric = inputs.pop(METRIC) aligned_target_dists = [] aligned_metrics = [] for i in range(self.blending_num): aligned_target_dists.append(inputs.pop(f"{OTHER_TARGET_DIST}_{i}")) aligned_metrics.append(inputs.pop(f"{ALIGNED_OTHER_METRIC}_{i}")) outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. 
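        # Knowledge-distillation loss (FedMKT), computed below:
        #   * every candidate target distribution -- the model's own `base_target_dist` and the
        #     aligned distributions from the blending models -- carries a per-example reward
        #     1 / exp(metric), where `metric` is the cross-entropy recorded when those logits
        #     were generated (lower perplexity => higher reward);
        #   * distill_strategy "greater" keeps, for each example, the candidate distribution with
        #     the highest reward, while "weighted_mean" softmax-normalizes the rewards and mixes
        #     the distributions;
        #   * the distillation term (CE or KL against the fused distribution, masked by the
        #     attention mask) is combined with the ordinary LM loss as
        #     lm_loss_weight * lm_loss + (1 - lm_loss_weight) * distill_loss.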
if self.args.past_index >= 0: self._past = outputs[self.args.past_index] if labels is not None: if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): loss = self.label_smoother(outputs, labels, shift_labels=True) else: loss = self.label_smoother(outputs, labels) else: if isinstance(outputs, dict) and "loss" not in outputs: raise ValueError( "The model did not return a loss from the inputs, only the following keys: " f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." ) # We don't use .loss here since the model may return tuples instead of ModelOutput. loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] batch_size, seq_len, vocab_size = outputs["logits"].size(0), outputs["logits"].size(1), outputs["logits"].size(2) aligned_rewards = [] for i in range(self.blending_num): aligned_rewards.append((1 / torch.exp(torch.tensor(aligned_metrics[i], dtype=torch.bfloat16))).to(loss.device)) base_reward = (1 / torch.exp(torch.tensor(base_metric, dtype=torch.bfloat16))).to(loss.device) if self.distill_strategy == "greater": base_reward_expanded = base_reward.unsqueeze(-1).unsqueeze(-1).expand_as(base_target_dist) aligned_rewards_expanded = [ aligned_rewards[i].unsqueeze(-1).unsqueeze(-1).expand_as(aligned_target_dists[i]) for i in range(self.blending_num) ] target_dist_list = [] reward_list = [] if base_target_dist is not None: target_dist_list.append(base_target_dist) reward_list.append(base_reward_expanded) target_dist_list.extend(aligned_target_dists) reward_list.extend(aligned_rewards_expanded) stacked_dists = torch.stack(target_dist_list, dim=-1) stacked_rewards = torch.stack(reward_list, dim=-1) max_reward_indices = torch.argmax(stacked_rewards, dim=-1, keepdim=True) target_dist = torch.gather(stacked_dists, -1, max_reward_indices).squeeze(-1) elif self.distill_strategy == "weighted_mean": weights = torch.stack( [base_reward] + aligned_rewards, dim=1 ) normalized_weights = torch.softmax(weights, dim=1) weight_labels = normalized_weights[:, 0].unsqueeze(1).unsqueeze(2) * base_target_dist for i in range(self.blending_num): weight_labels += normalized_weights[:, i + 1].unsqueeze(1).unsqueeze(2) * aligned_target_dists[i] target_dist = ( weight_labels ) else: raise ValueError(f"distill_strategy={self.distill_strategy}") if self.distill_loss_type == "ce": loss_lm = cross_entropy( input=outputs["logits"].view(-1, vocab_size), target=target_dist.view(-1, vocab_size), reduction="none", ).view(batch_size, -1) elif self.distill_loss_type == "kl": loss_lm = kl_div( input=log_softmax(outputs["logits"], dim=-1), target=target_dist, log_target=False, reduction="none").sum(dim=-1) else: raise ValueError(f"Not implement distill_loss_type={self.distill_loss_type}") loss_lm = (loss_lm * inputs["attention_mask"]).sum() / inputs["attention_mask"].sum() loss = self.lm_loss_weight * loss + (1.0 - self.lm_loss_weight) * loss_lm return (loss, outputs) if return_outputs else loss ================================================ FILE: python/fate_llm/algo/fedmkt/token_alignment/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/algo/fedmkt/token_alignment/spectal_token_mapping.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import transformers TOKENIZER_TO_SPECIAL_TOKEN = { transformers.LlamaTokenizer: '▁', transformers.LlamaTokenizerFast: '▁', transformers.GPTNeoXTokenizerFast: 'Ġ', transformers.GPT2TokenizerFast: 'Ġ', transformers.GPT2Tokenizer: 'Ġ', transformers.BloomTokenizerFast: 'Ġ', } ================================================ FILE: python/fate_llm/algo/fedmkt/token_alignment/token_align.py ================================================ # # NOTE: The dtw function is copied from FuseAI/FuseLLM # and the align_blending_model_logits_with_base_model_logits function is modified from FuseAI/FuseLLM # Copyright FuseAI # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
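#
# Module overview: aligns the per-step top-k logits produced by a "blending" model (tokenized
# with its own tokenizer) onto the token sequence of the "base" model, so that both can be
# distilled position by position. Two strategies are provided: "dtw" (dynamic time warping over
# token edit distances, from FuseLLM) and "greedy_dp" (exact-string span matching via dynamic
# programming, see greedy_dynamic_matching below).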
# import logging import transformers import editdistance import numpy as np from typing import Dict, List from fate_llm.algo.fedmkt.token_alignment.spectal_token_mapping import TOKENIZER_TO_SPECIAL_TOKEN from fate_llm.algo.fedmkt.utils.vars_define import ( PER_STEP_LOGITS, PER_STEP_INDICES, ALIGNED_OTHER_LOGITS, ALIGNED_OTHER_INDICES, ALIGNED_OTHER_METRIC, METRIC ) logger = logging.getLogger(__name__) def dtw(series_1, series_2, norm_func=np.linalg.norm): """code refer to: https://github.com/fanqiwan/FuseAI/blob/main/FuseLLM/src/utils/others.py#L318""" matrix = np.zeros((len(series_1) + 1, len(series_2) + 1)) matrix[0, :] = np.inf matrix[:, 0] = np.inf matrix[0, 0] = 0 for i, vec1 in enumerate(series_1): for j, vec2 in enumerate(series_2): cost = norm_func(vec1, vec2) matrix[i + 1, j + 1] = cost + min(matrix[i, j + 1], matrix[i + 1, j], matrix[i, j]) matrix = matrix[1:, 1:] i = matrix.shape[0] - 1 j = matrix.shape[1] - 1 matches = [] mappings_series_1 = [list() for v in range(matrix.shape[0])] mappings_series_2 = [list() for v in range(matrix.shape[1])] while i > 0 or j > 0: matches.append((i, j)) mappings_series_1[i].append(j) mappings_series_2[j].append(i) option_diag = matrix[i - 1, j - 1] if i > 0 and j > 0 else np.inf option_up = matrix[i - 1, j] if i > 0 else np.inf option_left = matrix[i, j - 1] if j > 0 else np.inf move = np.argmin([option_diag, option_up, option_left]) if move == 0: i -= 1 j -= 1 elif move == 1: i -= 1 else: j -= 1 matches.append((0, 0)) mappings_series_1[0].append(0) mappings_series_2[0].append(0) matches.reverse() for mp in mappings_series_1: mp.reverse() for mp in mappings_series_2: mp.reverse() return matches, matrix[-1, -1], mappings_series_1, mappings_series_2, matrix def greedy_dynamic_matching(base_model_tokens, blending_model_tokens, base_model_sp_t, blending_model_sp_t): l1 = len(base_model_tokens) l2 = len(blending_model_tokens) base_model_tokens = [token.replace(base_model_sp_t, "") for token in base_model_tokens] blending_model_tokens = [token.replace(blending_model_sp_t, "") for token in blending_model_tokens] dp = np.full((l1 + 1, l2 + 1), -1000000000, dtype="int32") matched_left = np.full((l1, l2), -1, dtype="int32") matched_right = np.full((l1, l2), -1, dtype="int32") trans_left = np.full((l1 + 1, l2 + 1), -1, dtype="int32") trans_right = np.full((l1 + 1, l2 + 1), -1, dtype="int32") # this can be optimizer use suffix data structure, but naive implemented for fast trial , it will be optimize later. 
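    # matched_left[i][j] / matched_right[i][j]: number of trailing base / blending tokens that
    # have to be concatenated so that the span ending at base position i spells the same string
    # as the span ending at blending position j (1/1 when the two tokens are identical, -1 when
    # no such pair of spans ends at (i, j)).
    # dp[i+1][j+1] then maximizes the number of base tokens covered by non-overlapping matched
    # span pairs within base[:i+1] x blending[:j+1]; trans_left / trans_right are back-pointers
    # used afterwards to recover the chosen span pairs into `matches`.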
for i in range(l1): for j in range(l2): if base_model_tokens[i] == blending_model_tokens[j]: matched_left[i][j] = 1 matched_right[i][j] = 1 continue i2, j2 = i, j t1 = "" t2 = "" sq_l1, sq_l2 = 0, 0 while i2 >= 0 and j2 >= 0: if len(t1) > len(t2): t2 = blending_model_tokens[j2] + t2 sq_l2 += 1 j2 -= 1 elif len(t1) < len(t2): t1 = base_model_tokens[i2] + t1 sq_l1 += 1 i2 -= 1 else: if sq_l1 == 0: sq_l1 += 1 sq_l2 += 1 t1 += base_model_tokens[i2] t2 += blending_model_tokens[j2] i2 -= 1 j2 -= 1 continue if t1 == t2: matched_left[i][j] = sq_l1 matched_right[i][j] = sq_l2 break """ always shortest matching """ for i in range(0, l1 + 1): dp[i][0] = 0 for j in range(0, l2 + 1): dp[0][j] = 1 for i in range(0, l1): for j in range(0, l2): if matched_left[i][j] == -1: dp[i + 1][j + 1] = max(dp[i + 1][j], dp[i][j + 1]) if dp[i + 1][j + 1] == dp[i + 1][j]: trans_right[i + 1][j + 1] = j else: trans_left[i + 1][j + 1] = i else: l_len = matched_left[i][j] r_len = matched_right[i][j] dp[i + 1][j + 1] = max(max(dp[i + 1][j], dp[i][j + 1]), dp[i + 1 - l_len][j + 1 - r_len] + l_len) if dp[i + 1][j + 1] == dp[i + 1 - l_len][j + 1 - r_len] + l_len: trans_left[i + 1][j + 1] = i + 1 - l_len trans_right[i + 1][j + 1] = j + 1 - r_len assert l_len > 0 and r_len > 0 elif dp[i + 1][j + 1] == dp[i + 1][j]: trans_right[i + 1][j + 1] = j else: trans_left[i + 1][j + 1] = i i, j = l1, l2 matches = [] while i > 0 and j > 0: if trans_left[i][j] != -1 and trans_right[i][j] != -1: l = trans_left[i][j] r = trans_right[i][j] matches.append([(l, i - 1), (r, j - 1)]) i, j = l, r elif trans_left[i][j] < 0: j -= 1 else: i -= 1 matches.reverse() return matches def align_blending_model_logits_with_base_model_logits(base_examples, indices, blending_examples, blending_to_base_mapping, base_tokenizer, blending_tokenizer, blending_model_index, skip_align=False, align_strategy="greedy_dp"): """modified from https://github.com/fanqiwan/FuseAI/blob/main/FuseLLM/src/utils/token_alignment.py#L101""" base_features = [{key: base_examples[key][i] for key in base_examples} for i in range(len(base_examples[next(iter(base_examples))]))] blending_features = [blending_examples[idx] for idx in indices] aligned_per_step_logits_list, aligned_per_step_indices_list = [], [] per_step_logits_list, per_step_indices_list = [], [] metric_ce_aligned = [] for base_feature, blending_feature in zip(base_features, blending_features): base_feature[PER_STEP_LOGITS] = base_feature[PER_STEP_LOGITS][:len(base_feature['input_ids'])] base_feature[PER_STEP_INDICES] = base_feature[PER_STEP_INDICES][:len(base_feature['input_ids'])] blending_feature[PER_STEP_LOGITS] = blending_feature[PER_STEP_LOGITS][:len(blending_feature['input_ids'])] blending_feature[PER_STEP_INDICES] = blending_feature[PER_STEP_INDICES][:len(blending_feature['input_ids'])] if skip_align is True: aligned_blending_model_per_step_logits = blending_feature[PER_STEP_LOGITS] aligned_blending_model_per_step_indices = blending_feature[PER_STEP_INDICES] else: aligned_blending_model_per_step_logits, aligned_blending_model_per_step_indices = transform_step_logits( base_model_tokenizer=base_tokenizer, blending_model_tokenizer=blending_tokenizer, base_model_vocab=base_tokenizer.get_vocab(), base_model_input_ids=base_feature['input_ids'], blending_model_input_ids=blending_feature['input_ids'], blending_model_per_step_logits=blending_feature[PER_STEP_LOGITS], blending_model_per_step_indices=blending_feature[PER_STEP_INDICES], blending_to_base_mapping=blending_to_base_mapping, align_strategy=align_strategy ) 
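        # Accumulate, per example, the (re-)aligned blending-model logits/indices next to the
        # base model's own truncated logits; they are written back into `base_examples` below
        # under the ALIGNED_OTHER_* / PER_STEP_* columns for this blending_model_index.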
aligned_per_step_logits_list.append(aligned_blending_model_per_step_logits) aligned_per_step_indices_list.append(aligned_blending_model_per_step_indices) per_step_logits_list.append(base_feature[PER_STEP_LOGITS]) per_step_indices_list.append(base_feature[PER_STEP_INDICES]) metric_ce_aligned.append(blending_feature[METRIC]) base_examples[PER_STEP_LOGITS] = per_step_logits_list base_examples[PER_STEP_INDICES] = per_step_indices_list base_examples[f"{ALIGNED_OTHER_LOGITS}_{blending_model_index}"] = aligned_per_step_logits_list base_examples[f"{ALIGNED_OTHER_INDICES}_{blending_model_index}"] = aligned_per_step_indices_list base_examples[f"{ALIGNED_OTHER_METRIC}_{blending_model_index}"] = metric_ce_aligned return base_examples def transform_step_logits(base_model_tokenizer: transformers.tokenization_utils_base.PreTrainedTokenizerBase, blending_model_tokenizer: transformers.tokenization_utils_base.PreTrainedTokenizerBase, base_model_vocab: Dict[str, int], base_model_input_ids: List[int], blending_model_input_ids: List[int], blending_model_per_step_logits: List[List[float]], blending_model_per_step_indices: List[List[int]], blending_to_base_mapping: Dict[str, str] = None, align_strategy: str = "dtw" ): """modified from https://github.com/fanqiwan/FuseAI/blob/main/FuseLLM/src/utils/others.py#L364""" """Align blending model per step logits & indices with base model.""" base_model_tokens = base_model_tokenizer.convert_ids_to_tokens(base_model_input_ids) blending_model_tokens = blending_model_tokenizer.convert_ids_to_tokens(blending_model_input_ids) base_model_special_token = TOKENIZER_TO_SPECIAL_TOKEN[base_model_tokenizer.__class__] blending_model_special_token = TOKENIZER_TO_SPECIAL_TOKEN[blending_model_tokenizer.__class__] aligned_blending_model_per_step_logits, aligned_blending_model_per_step_indices = [], [] if align_strategy == "dtw": def dist_fn(a, b): """Calculate editdistance between two tokens, a is from blending model, b is from base model.""" return editdistance.eval(a.replace(blending_model_special_token, ''), b.replace(base_model_special_token, '')) _, _, _, base_to_blending, _ = dtw(blending_model_tokens, base_model_tokens, norm_func=dist_fn) for i, blending_idx in enumerate(base_to_blending): aligned_blending_model_per_step_logit = [] aligned_blending_model_per_step_index = [] if len(blending_idx) == 1: # one base token map to one blending token j = blending_idx[0] base_token = base_model_tokens[i] blending_token = blending_model_tokens[j].replace(blending_model_special_token, base_model_special_token) if ( blending_model_tokenizer.__class__ == transformers.GPTNeoXTokenizerFast or blending_model_tokenizer.__class__ == transformers.GPT2TokenizerFast) and i == 0 and base_token.startswith( base_model_special_token) and not blending_token.startswith(base_model_special_token): blending_token = base_model_special_token + blending_token # special case for mpt if (base_token == blending_token) or ( blending_token in blending_to_base_mapping and base_token == blending_to_base_mapping[ blending_token]): # find the aligned mapping, use the corresponding logits # the logits and indices at this step for blending_logit, blending_index in zip(blending_model_per_step_logits[j], blending_model_per_step_indices[j]): # the token corresponds to the logit and indices blending_t = blending_model_tokenizer.convert_ids_to_tokens([blending_index])[0].replace( blending_model_special_token, base_model_special_token) blending_t = blending_to_base_mapping[blending_t] if blending_t in base_model_vocab: 
aligned_index = base_model_vocab[blending_t] # the index of the token in base model vocab if aligned_index not in aligned_blending_model_per_step_index: aligned_blending_model_per_step_index.append(aligned_index) aligned_blending_model_per_step_logit.append(blending_logit) else: logger.warning(f"blending_t: {blending_t} not in base_model_vocab!") else: # find error aligned mapping, use the one-hot logits aligned_blending_model_per_step_index.append(base_model_vocab[base_token]) aligned_blending_model_per_step_logit.append(1.0) else: # one base token map to multiple blending token, in this case only fit base token. use the one-hot logits base_token = base_model_tokens[i] aligned_blending_model_per_step_index.append(base_model_vocab[base_token]) aligned_blending_model_per_step_logit.append(1.0) aligned_blending_model_per_step_indices.append(aligned_blending_model_per_step_index) aligned_blending_model_per_step_logits.append(aligned_blending_model_per_step_logit) elif align_strategy == "greedy_dp": matches = greedy_dynamic_matching(base_model_tokens, blending_model_tokens, base_model_special_token, blending_model_special_token) fusion_logits = [[] for _ in range(len(matches))] fusion_indices = [[] for _ in range(len(matches))] match_pos = [-1] * len(base_model_tokens) used = [False] * len(matches) for idx, ((start_pos_1, end_pos_1), (start_pos_2, end_pos_2)) in enumerate(matches): fusion_dict = dict() fusion_counter_dict = dict() for blending_pos in range(start_pos_2, end_pos_2 + 1): for blending_logit, blending_index in zip(blending_model_per_step_logits[blending_pos], blending_model_per_step_indices[blending_pos]): if blending_index not in fusion_dict: fusion_dict[blending_index] = 0 fusion_counter_dict[blending_index] = 0 fusion_dict[blending_index] += blending_logit fusion_counter_dict[blending_index] += 1 for j in range(start_pos_1, end_pos_1 + 1): match_pos[j] = idx for token_index, token_logit in fusion_dict.items(): fusion_logits[idx].append(token_logit / fusion_counter_dict[token_index]) fusion_indices[idx].append(token_index) for i in range(len(base_model_tokens)): aligned_blending_model_per_step_logit = [] aligned_blending_model_per_step_index = [] if match_pos[i] == -1 or used[match_pos[i]]: base_token = base_model_tokens[i] aligned_blending_model_per_step_index.append(base_model_vocab[base_token]) aligned_blending_model_per_step_logit.append(1.0) else: pos = match_pos[i] used[pos] = True for blending_logit, blending_index in zip(fusion_logits[pos], fusion_indices[pos]): # the token corresponds to the logit and indices blending_t = blending_model_tokenizer.convert_ids_to_tokens([blending_index])[0].replace( blending_model_special_token, base_model_special_token) blending_t = blending_to_base_mapping[blending_t] if blending_t in base_model_vocab: aligned_index = base_model_vocab[blending_t] # the index of the token in base model vocab if aligned_index not in aligned_blending_model_per_step_index: aligned_blending_model_per_step_index.append(aligned_index) aligned_blending_model_per_step_logit.append(blending_logit) else: logger.warning(f"blending_t: {blending_t} not in base_model_vocab!") aligned_blending_model_per_step_indices.append(aligned_blending_model_per_step_index) aligned_blending_model_per_step_logits.append(aligned_blending_model_per_step_logit) else: raise ValueError(f"{align_strategy} not implemented yet.") return aligned_blending_model_per_step_logits, aligned_blending_model_per_step_indices def token_align( base_model_logits_datasets, blending_model_logits_dataset, 
base_tokenizer, blending_tokenizer, blending_to_base_mapping, blending_model_index, batch_size=4, preprocessing_num_workers=4, skip_align=False, align_strategy="dtw", ): assert len(base_model_logits_datasets) == len(blending_model_logits_dataset) base_model_blending_model_logits_datasets = base_model_logits_datasets.map( align_blending_model_logits_with_base_model_logits, batched=True, batch_size=batch_size, with_indices=True, num_proc=preprocessing_num_workers, load_from_cache_file=True, fn_kwargs={"blending_examples": blending_model_logits_dataset, "blending_to_base_mapping": blending_to_base_mapping, "base_tokenizer": base_tokenizer, "blending_tokenizer": blending_tokenizer, "blending_model_index": blending_model_index, "skip_align": skip_align, "align_strategy": align_strategy}, keep_in_memory=True, desc="Align blending model's logits with base model's logits.", ) return base_model_blending_model_logits_datasets ================================================ FILE: python/fate_llm/algo/fedmkt/token_alignment/vocab_mapping.py ================================================ # # NOTE: The find_best_mapping function is copied from FuseAI/FuseLLM # Copyright FuseAI/FuseLLM # # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
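#
# Usage sketch -- the model paths and output file below are illustrative placeholders, not
# files shipped with this repository:
#
#   from fate_llm.algo.fedmkt.token_alignment.vocab_mapping import get_vocab_mappings
#
#   # maps every token of the first tokenizer to its closest token (by edit distance) in the
#   # second tokenizer's vocabulary and saves the result as JSON; the saved mapping is what
#   # FedMKT consumes as `blending_to_base_mapping`
#   mapping = get_vocab_mappings(
#       model_name_or_path="path/to/llm",
#       candidate_model_name_or_path="path/to/slm",
#       vocab_mapping_save_path="llm_to_slm_vocab_mapping.json",
#   )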
#
import json
import editdistance
import tqdm
import multiprocessing
import logging

from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer
from fate_llm.algo.fedmkt.token_alignment.spectal_token_mapping import TOKENIZER_TO_SPECIAL_TOKEN

logger = logging.getLogger(__name__)


def find_best_mapping(x, base_tokens, blending_model_special_token, base_model_special_token, best_one=True):
    """code refer to https://github.com/fanqiwan/FuseAI/blob/main/FuseLLM/src/utils/vocab_mapping.py#L82"""
    tmp_x = x.replace(blending_model_special_token, base_model_special_token)
    if tmp_x in base_tokens:
        return tmp_x, tmp_x
    else:
        if best_one:
            return tmp_x, min([(y, editdistance.eval(tmp_x, y)) for y in base_tokens], key=lambda d: d[1])[0]
        else:
            token_and_distance = [(y, editdistance.eval(tmp_x, y)) for y in base_tokens]
            min_distance = min(item[1] for item in token_and_distance)
            shortest_distance_tokens = [item[0] for item in token_and_distance if item[1] == min_distance]
            return tmp_x, shortest_distance_tokens


def get_vocab_mappings(model_name_or_path, candidate_model_name_or_path, vocab_mapping_save_path, num_processors=8):
    ori_tokenizer = get_tokenizer(model_name_or_path)
    candidate_tokenizer = get_tokenizer(candidate_model_name_or_path)
    ori_special_tok = TOKENIZER_TO_SPECIAL_TOKEN[ori_tokenizer.__class__]
    candidate_special_tok = TOKENIZER_TO_SPECIAL_TOKEN[candidate_tokenizer.__class__]
    candidate_tokens = list(candidate_tokenizer.get_vocab().keys())
    with multiprocessing.Pool(num_processors) as process_pool:
        func_args = [(tok, candidate_tokens, ori_special_tok, candidate_special_tok) for tok in ori_tokenizer.get_vocab()]
        # keep `total` inside the tqdm call: passing it to dict() would silently add a bogus
        # "total" entry to the vocabulary mapping
        vocab_mappings = dict(tqdm.tqdm(process_pool.starmap(find_best_mapping, func_args),
                                        total=len(ori_tokenizer.get_vocab())))

    with open(vocab_mapping_save_path, "w") as fout:
        json.dump(vocab_mappings, fout)

    return vocab_mappings


================================================
FILE: python/fate_llm/algo/fedmkt/utils/__init__.py
================================================
#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


================================================
FILE: python/fate_llm/algo/fedmkt/utils/dataset_sync_util.py
================================================
#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
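#
# sync_dataset(): helper for multi-GPU runs (world_size > 1). Rank 0 turns the relevant dataset
# columns (input_ids / attention_mask / labels, the per-step logits and indices, and the metric)
# into tensors and scatters first their shapes and then their values to every rank; the other
# ranks allocate matching buffers, receive them, and rebuild a datasets.Dataset from the result.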
# import logging import datasets import torch import torch.distributed as dist from fate_llm.algo.fedmkt.utils.vars_define import ( METRIC, PER_STEP_LOGITS, PER_STEP_INDICES, ) logger = logging.getLogger(__name__) def sync_dataset(dataset, local_rank, world_size, device): integer_keys_2d = ["input_ids", "attention_mask", "labels"] integer_keys_3d = [PER_STEP_INDICES] float_keys_3d = [PER_STEP_LOGITS] float_keys_1d = [METRIC] if local_rank == 0: for key in integer_keys_2d + integer_keys_3d + float_keys_3d + float_keys_1d: if key in integer_keys_2d or key in integer_keys_3d: dtype = torch.int32 else: dtype = torch.float64 values = dataset[key] v_tensor = torch.tensor(values, dtype=dtype).cuda(device) shape_tensor = torch.tensor(v_tensor.shape, dtype=torch.int32).cuda(device) shape_tensors = [shape_tensor for _ in range(world_size)] dist.scatter(shape_tensor, shape_tensors, async_op=False) v_tensors = [v_tensor for _ in range(world_size)] dist.scatter(v_tensor, v_tensors, async_op=False) return dataset else: data_dict = dict() for key in integer_keys_2d + integer_keys_3d + float_keys_3d + float_keys_1d: if key in integer_keys_2d or key in integer_keys_3d: dtype = torch.int32 else: dtype = torch.float64 if key in integer_keys_2d: shape_tensor = torch.tensor([0, 0], dtype=torch.int32).cuda(device) elif key in float_keys_3d or key in integer_keys_3d: shape_tensor = torch.tensor([0, 0, 0], dtype=torch.int32).cuda(device) else: shape_tensor = torch.tensor([0], dtype=torch.int32).cuda(device) dist.scatter(shape_tensor, src=0, async_op=False) v_tensor = torch.zeros(shape_tensor.tolist(), dtype=dtype).cuda(device) dist.scatter(v_tensor, src=0, async_op=False) data_dict[key] = v_tensor.tolist() return datasets.Dataset.from_dict(data_dict) ================================================ FILE: python/fate_llm/algo/fedmkt/utils/generate_logit_utils.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
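#
# generate_pub_data_logits() is mapped over the public dataset in batches: it runs the local
# model in eval mode, keeps only the top-k logits per step (LogitsSelection, controlled by
# training_args.top_k_logits_keep / top_k_strategy) plus a per-example cross-entropy metric
# (Metric), and writes them back as the PER_STEP_LOGITS / PER_STEP_INDICES / METRIC columns
# that later drive token alignment and distillation.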
#
import torch
import torch.nn.functional as F
import gc

from fate_llm.algo.fedmkt.utils.vars_define import (
    PER_STEP_LOGITS,
    PER_STEP_INDICES,
    METRIC
)


class Metric(object):
    @classmethod
    def cal_metric(cls, logits, input_ids, attention_mask, labels, training_args):
        if training_args.metric_type == "ce":
            return cls.cal_ce(logits, input_ids, attention_mask, labels, training_args)
        else:
            raise NotImplementedError(f"metric={training_args.metric_type} is not implemented yet")

    @classmethod
    def cal_ce(cls, logits, input_ids, attention_mask, labels, training_args):
        metric = F.cross_entropy(logits[..., :-1, :].contiguous().view(-1, logits.size(-1)),
                                 labels[..., 1:].contiguous().view(-1),
                                 reduction="none").view(logits.size(0), -1)
        metric = (metric * attention_mask[..., 1:]).sum(dim=-1) / attention_mask[..., 1:].sum(dim=-1)
        return metric


class LogitsSelection(object):
    @classmethod
    def select_logits(cls, logits, training_args):
        if training_args.top_k_strategy == "highest":
            return cls.select_highest(logits, training_args.top_k_logits_keep)
        else:
            raise NotImplementedError(f"logits selection strategy={training_args.top_k_strategy} is not implemented")

    @classmethod
    def select_highest(cls, logits, top_k_logits_keep):
        top_k_logits, top_k_indices = torch.topk(logits.cuda(), k=top_k_logits_keep)
        logits.cpu()
        return top_k_logits, top_k_indices


def generate_pub_data_logits(inputs, model, training_args, data_collator):
    input_keys = ["attention_mask", "input_ids", "labels"]
    inputs_per_batched = [dict() for _ in range(len(inputs[input_keys[1]]))]
    for key in input_keys:
        if key not in inputs:
            continue
        for idx, _in in enumerate(inputs[key]):
            inputs_per_batched[idx][key] = _in

    if "attention_mask" not in inputs:
        for idx in range(len(inputs_per_batched)):
            inputs_per_batched[idx]["attention_mask"] = [1] * len(inputs_per_batched[idx]["input_ids"])

    inputs_per_batched = data_collator(inputs_per_batched)

    input_ids = inputs_per_batched["input_ids"]
    attention_mask = inputs_per_batched["attention_mask"]
    labels = inputs_per_batched["labels"]

    device = next(model.parameters()).device
    if device.type == "cuda":
        input_ids = input_ids.cuda(device)
        attention_mask = attention_mask.cuda(device)
        labels = labels.cuda(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        metric = Metric.cal_metric(logits, input_ids, attention_mask, labels, training_args)

    input_ids.cpu()
    del input_ids
    attention_mask.cpu()
    del attention_mask
    labels.cpu()
    del labels
    logits.cpu()
    metric.cpu()

    if training_args.top_k_logits_keep is None:
        raise ValueError("Please specify top_k_logits_keep: saving the full logits would exceed memory")

    selected_logits, selected_indices = LogitsSelection.select_logits(logits=logits, training_args=training_args)
    selected_logits.cpu()
    selected_indices.cpu()

    inputs[PER_STEP_LOGITS] = selected_logits
    inputs[PER_STEP_INDICES] = selected_indices
    inputs[METRIC] = metric

    del logits
    gc.collect()

    model.train()

    return inputs


================================================
FILE: python/fate_llm/algo/fedmkt/utils/tokenizer_tool.py
================================================
#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import AutoConfig def get_vocab_size(tokenizer_name_or_path): if tokenizer_name_or_path is not None: return AutoConfig.from_pretrained(tokenizer_name_or_path) ================================================ FILE: python/fate_llm/algo/fedmkt/utils/vars_define.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # PER_STEP_LOGITS = "per_step_logits" PER_STEP_INDICES = "per_step_indices" METRIC = "metric" ALIGNED_OTHER_LOGITS = "aligned_other_logits" ALIGNED_OTHER_INDICES = "aligned_other_indices" ALIGNED_OTHER_METRIC = "aligned_other_metrice" SELF_TARGET_DIST = "llm_target_distribution" OTHER_TARGET_DIST = "slm_target_distribution" INPUT_KEYS = {"input_ids", "attention_mask", "labels"} ================================================ FILE: python/fate_llm/algo/inferdpt/__init__.py ================================================ ================================================ FILE: python/fate_llm/algo/inferdpt/_encode_decode.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate.arch import Context from typing import List, Dict import logging logger = logging.getLogger(__name__) class EncoderDecoder(object): def __init__(self, ctx: Context) -> None: self.ctx = ctx def encode(self, docs: List[Dict[str, str]], format_template: str): pass def decode(self, docs: List[Dict[str, str]], format_template: str ): pass def inference(self, docs: List[Dict[str, str]], inference_kwargs: dict = {}, format_template: str = None): pass ================================================ FILE: python/fate_llm/algo/inferdpt/inferdpt.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import copy from jinja2 import Template from tqdm import tqdm from fate.arch import Context from typing import List, Dict, Union from fate.ml.nn.dataset.base import Dataset from fate_llm.algo.inferdpt.utils import InferDPTKit from openai import OpenAI import logging from fate_llm.inference.inference_base import Inference from fate_llm.algo.inferdpt._encode_decode import EncoderDecoder from fate_llm.dataset.hf_dataset import HuggingfaceDataset logger = logging.getLogger(__name__) class InferDPTClient(EncoderDecoder): def __init__(self, ctx: Context, inferdpt_pertub_kit: InferDPTKit, local_inference_inst: Inference, epsilon: float = 3.0,) -> None: self.ctx = ctx self.kit = inferdpt_pertub_kit assert epsilon > 0, 'epsilon must be a float > 0' self.ep = epsilon self.comm_idx = 0 self.local_inference_inst = local_inference_inst def encode(self, docs: List[Dict[str, str]], format_template: str = None, verbose=False, perturb_doc_key: str ='perturbed_doc') -> List[Dict[str, str]]: copy_docs = copy.deepcopy(docs) if format_template is not None: template = Template(format_template) else: template = None for doc in tqdm(copy_docs): if template is None: rendered_doc = str(doc) else: rendered_doc = template.render(**doc) if verbose: logger.debug('doc to perturb {}'.format(rendered_doc)) p_doc = self.kit.perturb(rendered_doc, self.ep) doc[perturb_doc_key] = p_doc return copy_docs def _remote_inference(self, docs: List[Dict[str, str]], inference_kwargs: dict = {}, format_template: str = None, perturbed_response_key: str = 'perturbed_response', verbose=False ) -> List[Dict[str, str]]: copy_docs = copy.deepcopy(docs) if format_template is not None: template = Template(format_template) else: template = None infer_docs = [] for doc in tqdm(copy_docs): if template is None: rendered_doc = str(doc) else: rendered_doc = template.render(**doc) if verbose: logger.debug('inference doc {}'.format(rendered_doc)) infer_docs.append(rendered_doc) doc['perturbed_doc_with_instrcution'] = rendered_doc self.ctx.arbiter.put('client_data_{}'.format(self.comm_idx), (infer_docs, inference_kwargs)) perturb_resp = self.ctx.arbiter.get('pdoc_{}'.format(self.comm_idx)) self.comm_idx += 1 for pr, doc in zip(perturb_resp, copy_docs): doc[perturbed_response_key] = pr return copy_docs def decode(self, p_docs: List[Dict[str, str]], instruction_template: str = None, decode_template: str = None, verbose=False, perturbed_response_key: str = 'perturbed_response', result_key: str = 'inferdpt_result', remote_inference_kwargs: dict = {}, local_inference_kwargs: dict = {}): # inference using remote large models docs_with_infer_result = self._remote_inference(p_docs, format_template=instruction_template, verbose=verbose, inference_kwargs=remote_inference_kwargs, perturbed_response_key=perturbed_response_key) if decode_template is not None: dt = Template(decode_template) doc_to_decode = [dt.render(**i) for i in docs_with_infer_result] else: doc_to_decode = [str(i) for i in docs_with_infer_result] # local model decode final_result = self.local_inference_inst.inference(doc_to_decode, local_inference_kwargs) for final_r, d in zip(final_result, 
docs_with_infer_result): d[result_key] = final_r return docs_with_infer_result def inference(self, docs: Union[List[Dict[str, str]], HuggingfaceDataset], encode_template: str, instruction_template: str, decode_template: str, verbose: bool = False, remote_inference_kwargs: dict = {}, local_inference_kwargs: dict = {}, perturb_doc_key: str = 'perturbed_doc', perturbed_response_key: str = 'perturbed_response', result_key: str = 'inferdpt_result', ) -> List[Dict[str, str]]: assert (isinstance(docs, list) and isinstance(docs[0], dict)) or isinstance(docs, HuggingfaceDataset), 'Input doc must be a list of dict or HuggingfaceDataset' # perturb doc if isinstance(docs, HuggingfaceDataset): docs = [docs[i] for i in range(len(docs))] docs_with_p = self.encode(docs, format_template=encode_template, verbose=verbose, perturb_doc_key=perturb_doc_key) logger.info('encode done') # inference using perturbed doc final_result = self.decode( docs_with_p, instruction_template, decode_template, verbose, perturbed_response_key, result_key, remote_inference_kwargs, local_inference_kwargs, ) logger.info('decode done') return final_result class InferDPTServer(object): def __init__(self, ctx: Context, inference_inst: Inference) -> None: self.ctx = ctx self.inference_inst = inference_inst self.comm_idx = 0 def inference(self, verbose=False): client_data = self.ctx.guest.get('client_data_{}'.format(self.comm_idx)) perturbed_docs, inference_kwargs = client_data if verbose: logger.info('got data {}'.format(client_data)) logger.info('start inference') rs_doc = self.inference_inst.inference(perturbed_docs, inference_kwargs) self.ctx.guest.put('pdoc_{}'.format(self.comm_idx), rs_doc) self.comm_idx += 1 def predict(self): self.inference() ================================================ FILE: python/fate_llm/algo/inferdpt/init/_init.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate.arch import Context from typing import Union class InferInit(object): def __init__(self, ctx: Context): self.ctx = ctx def get_inst(self): pass ================================================ FILE: python/fate_llm/algo/inferdpt/init/default_init.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
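#
# Usage sketch -- the endpoint, model name, kit path and templates below are illustrative
# placeholders, not values shipped with this repository:
#
#   class MyClientInit(InferDPTAPIClientInit):
#       api_url = "http://127.0.0.1:8000/v1"
#       api_model_name = "remote-llm"
#       inferdpt_kit_path = "/data/inferdpt"
#       eps = 3.0
#
#   client = MyClientInit(ctx).get_inst()
#   results = client.inference(
#       docs,                                    # list of dicts, e.g. [{"content": "..."}]
#       encode_template="{{content}}",
#       instruction_template="{{perturbed_doc}}",
#       decode_template="{{perturbed_doc}}{{perturbed_response}}",
#   )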
# from fate_llm.algo.inferdpt.init._init import InferInit from fate_llm.inference.api import APICompletionInference from fate_llm.algo.inferdpt import inferdpt from fate_llm.algo.inferdpt.utils import InferDPTKit from fate_llm.algo.inferdpt.inferdpt import InferDPTClient, InferDPTServer class InferDPTAPIClientInit(InferInit): api_url = '' api_model_name = '' api_key = 'EMPTY' inferdpt_kit_path = '' eps = 3.0 def __init__(self, ctx): super().__init__(ctx) self.ctx = ctx def get_inst(self)-> InferDPTClient: inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key) kit = InferDPTKit.load_from_path(self.inferdpt_kit_path) inferdpt_client = inferdpt.InferDPTClient(self.ctx, kit, inference, epsilon=self.eps) return inferdpt_client class InferDPTAPIServerInit(InferInit): api_url = '' api_model_name = '' api_key = 'EMPTY' def __init__(self, ctx): super().__init__(ctx) self.ctx = ctx def get_inst(self)-> InferDPTServer: inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key) inferdpt_server = inferdpt.InferDPTServer(self.ctx,inference_inst=inference) return inferdpt_server ================================================ FILE: python/fate_llm/algo/inferdpt/utils.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
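#
# InferDPTKit.perturb() implements the token-level perturbation used by InferDPT: for every
# token it adds Laplace noise (per-dimension scale delta_f / (0.5 * epsilon)) to the token's
# embedding, measures the cosine similarity between the original and the noisy embedding,
# restricts the candidate set to tokens at least that similar to the original, and finally
# samples a replacement from that set with probabilities proportional to
# exp(epsilon / (2 * Delta_u) * similarity), an exponential-mechanism style draw.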
# """ Parts of the codes are modified from https://github.com/mengtong0110/InferDPT """ from decimal import getcontext from transformers import AutoTokenizer import numpy as np import json import tqdm from typing import List getcontext().prec = 100 class NumpyEncoder(json.JSONEncoder): """ Special json encoder for numpy types """ def default(self, obj): if isinstance(obj, np.integer): return int(obj) elif isinstance(obj, np.floating): return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() return json.JSONEncoder.default(self, obj) def save_jsonl(filename, data): with open(filename, 'w') as file: for item in data: json.dump(item, file) file.write('\n') def create_sensitivity_of_embeddings(all_embedding_matrix): n_dimensions = all_embedding_matrix.shape[1] delta_f_new = np.zeros(n_dimensions) for dim in tqdm.trange(n_dimensions): dim_data = all_embedding_matrix[:, dim] sorted_dim_data = np.sort(dim_data) differences = sorted_dim_data[-1] - sorted_dim_data[0] delta_f_new[dim] = differences return delta_f_new def create_sorted_embedding_matrix(token_list, similarity_matrix): token_2_sorted_distances = dict() token_array = np.array(token_list) for idx, token in tqdm.tqdm(enumerate(token_list)): similarity_array = similarity_matrix[idx] sorted_indices = np.argsort(similarity_array)[::-1] token_2_sorted_distances[token] = [token_array[sorted_indices].tolist(), similarity_array[sorted_indices].tolist()] return token_2_sorted_distances def cosine_similarity_vectors(A, B): dot_product = np.dot(A, B) norm_a = np.linalg.norm(A) norm_b = np.linalg.norm(B) similarity = dot_product / (norm_a * norm_b) return similarity class InferDPTKit(object): def __init__(self, token_to_vector_dict, sorted_similarities, delta_f, tokenizer) -> None: self.token_to_vector_dict = token_to_vector_dict self.sorted_similarities = sorted_similarities self.delta_f = delta_f self.tokenizer = tokenizer assert len(token_to_vector_dict) == len(sorted_similarities) def save_to_path(self, path): # make folder import os if not os.path.exists(path+'/inferdpt_kit'): os.makedirs(path+'/inferdpt_kit') with open(path+'/inferdpt_kit/token_2_vector.json', 'w', encoding='utf8') as f: json.dump(self.token_to_vector_dict, f, ensure_ascii=False, cls=NumpyEncoder) with open(path+'/inferdpt_kit/sorted_similarities.json', 'w') as f: json.dump(self.sorted_similarities, f, cls=NumpyEncoder) with open(path+'/inferdpt_kit/delta_f.json', 'w') as f: json.dump(self.delta_f, f, cls=NumpyEncoder) self.tokenizer.save_pretrained(path+'/inferdpt_kit/tokenizer/') @staticmethod def make_inferdpt_kit_param(embedding_matrix: np.ndarray, token_list: List[str]): def cosine_simi(embedding_matrix1, embedding_matrix2): dot_product = np.dot(embedding_matrix1, embedding_matrix2.T) norm_matrix1 = np.linalg.norm(embedding_matrix1, axis=1) norm_matrix2 = np.linalg.norm(embedding_matrix2, axis=1) similarity = dot_product / (np.outer(norm_matrix1, norm_matrix2)) return similarity assert len(embedding_matrix) == len(token_list) similarity_matrix = cosine_simi(embedding_matrix, embedding_matrix) token_sorted_distance_dict = create_sorted_embedding_matrix(token_list, similarity_matrix) delta_f_new = create_sensitivity_of_embeddings(embedding_matrix) token_2_embedding = {} for token, embedding in zip(token_list, embedding_matrix): token_2_embedding[token] = embedding return token_2_embedding, token_sorted_distance_dict, delta_f_new @staticmethod def load_from_path(path): with open(path+'/inferdpt_kit/token_2_vector.json', 'r', encoding='utf8') as f: 
token_to_vector_dict = json.load(f) with open(path+'/inferdpt_kit/sorted_similarities.json', 'r') as f: sorted_similarities = json.load(f) with open(path+'/inferdpt_kit/delta_f.json', 'r') as f: delta_f = np.array(json.load(f)) tokenizer = AutoTokenizer.from_pretrained(path+'/inferdpt_kit/tokenizer/') inferdpt_kit = InferDPTKit(token_to_vector_dict, sorted_similarities, delta_f, tokenizer) return inferdpt_kit def perturb(self, doc: str, epsilon: float) -> str: # epsilon > 0 assert epsilon > 0, "epsilon should be greater than 0" tokenizer = self.tokenizer tokens = tokenizer.tokenize(doc) new_tokens = [] Delta_u = 1.0 exp_factor = epsilon / (2 * Delta_u) for origin_token in tokens: if origin_token[0] == ' ': origin_token = origin_token[1:] origin_embed = self.token_to_vector_dict.get(origin_token, None) if origin_embed is None: new_tokens.append(origin_token) continue noise_embed = add_laplace_noise_to_vector(origin_embed, epsilon, self.delta_f) similarity = cosine_similarity_vectors(origin_embed, noise_embed) sorted_distances_for_token = self.sorted_similarities.get(origin_token, None) if sorted_distances_for_token is None: continue token_only = sorted_distances_for_token[0] similarity_only = sorted_distances_for_token[1] arr = np.flip(similarity_only) index = np.searchsorted(arr, similarity) index = len(arr) - index close_tokens = token_only[:index] close_similarities = similarity_only[:index] if len(close_tokens) == 0: continue unnormalized_probabilities = np.exp(exp_factor * np.array(close_similarities)) total_unnormalized_prob = np.sum(unnormalized_probabilities) probabilities = unnormalized_probabilities / total_unnormalized_prob selected_token = np.random.choice(close_tokens, p=probabilities) new_tokens.append(selected_token) token_ids = tokenizer.convert_tokens_to_ids(new_tokens) sentence = tokenizer.decode(token_ids) return sentence def cosine_similarity_vectors(A, B): dot_product = np.dot(A, B) norm_a = np.linalg.norm(A) norm_b = np.linalg.norm(B) similarity = dot_product / (norm_a * norm_b) return similarity def add_laplace_noise_to_vector(vector, epsilon, delta_f_new): vector = np.asarray(vector, dtype=np.longdouble) if epsilon == 0: beta_values = delta_f_new * 0 else: beta_values = delta_f_new / (0.5 * epsilon) noise = np.random.laplace(loc=0, scale=beta_values, size=len(beta_values)) noisy_vector = vector + noise return noisy_vector def perturb_sentence(sent, epsilon, tokenizer, token_to_vector_dict, sorted_distance_data, delta_f_new): tokens = tokenizer.tokenize(sent) new_tokens = [] Delta_u = 1.0 exp_factor = epsilon / (2 * Delta_u) for origin_token in tokens: if origin_token[0] == ' ': origin_token = origin_token[1:] origin_embed = token_to_vector_dict.get(origin_token, None) if origin_embed is None: new_tokens.append(origin_token) continue noise_embed = add_laplace_noise_to_vector(origin_embed, epsilon, delta_f_new) similarity = cosine_similarity_vectors(origin_embed, noise_embed) sorted_distances_for_token = sorted_distance_data.get(origin_token, None) if sorted_distances_for_token is None: continue token_only = sorted_distances_for_token[0] similarity_only = sorted_distances_for_token[1] arr = np.flip(similarity_only) index = np.searchsorted(arr, similarity) index = len(arr) - index close_tokens = token_only[:index] close_similarities = similarity_only[:index] if len(close_tokens) == 0: continue unnormalized_probabilities = np.exp(exp_factor * np.array(close_similarities)) total_unnormalized_prob = np.sum(unnormalized_probabilities) probabilities = 
unnormalized_probabilities / total_unnormalized_prob selected_token = np.random.choice(close_tokens, p=probabilities) new_tokens.append(selected_token) token_ids = tokenizer.convert_tokens_to_ids(new_tokens) sentence = tokenizer.decode(token_ids) return sentence ================================================ FILE: python/fate_llm/algo/offsite_tuning/__init__.py ================================================ ================================================ FILE: python/fate_llm/algo/offsite_tuning/offsite_tuning.py ================================================ from fate.ml.aggregator.base import Aggregator from fate_llm.algo.fedavg.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer, Seq2SeqTrainingArguments from fate.ml.nn.trainer.trainer_base import FedArguments, TrainingArguments from typing import List, Optional, Callable, Tuple from fate.arch import Context from torch.optim import Optimizer from torch.utils.data import DataLoader, Dataset from torch.optim.lr_scheduler import _LRScheduler from transformers.trainer_callback import TrainerCallback from torch.nn import Module from transformers import TrainerState, TrainerControl, PreTrainedTokenizer from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningBaseModel import logging import torch import torch.distributed as dist from transformers.modeling_utils import unwrap_model logger = logging.getLogger(__name__) class OffsiteTuningTrainerClient(Seq2SeqFedAVGClient): def __init__( self, ctx: Context, model: OffsiteTuningBaseModel, training_args: Seq2SeqTrainingArguments, fed_args: FedArguments, train_set: Dataset, val_set: Dataset = None, optimizer: Optimizer = None, scheduler: _LRScheduler = None, data_collator: Callable = None, tokenizer: Optional[PreTrainedTokenizer] = None, callbacks: List[TrainerCallback] = [], compute_metrics: Callable = None, aggregate_model: bool = False, save_trainable_weights_only: bool = False, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): assert isinstance(model, OffsiteTuningBaseModel), "model must be the subclass of OffsiteTuningBaseModel" if aggregate_model == False and fed_args is None: fed_args = FedArguments() elif fed_args is None: raise ValueError("fed_args must be provided when aggregate_model is True") local_mode = True if not aggregate_model else False super().__init__( ctx, model, training_args, fed_args, train_set, val_set, optimizer, scheduler, data_collator, tokenizer, callbacks, compute_metrics, local_mode, save_trainable_weights_only, preprocess_logits_for_metrics ) self._aggregate_model = aggregate_model def _share_model(self, model, args: Seq2SeqTrainingArguments, sync_trainable_only=True): if args.local_rank == 0: for p in model.parameters(): if (not sync_trainable_only) or (sync_trainable_only and p.requires_grad): scatter_list = [p.data for _ in range(args.world_size)] dist.scatter(p.data, scatter_list, async_op=False) else: for p in model.parameters(): if (not sync_trainable_only) or (sync_trainable_only and p.requires_grad): dist.scatter(p.data, src=0, async_op=False) def on_train_begin(self, ctx: Context, aggregator: Aggregator, fed_args: FedArguments, args: TrainingArguments, model: Module = None, optimizer: Optimizer = None, scheduler: _LRScheduler = None, dataloader: Tuple[DataLoader]= None, control: TrainerControl= None, state: TrainerState = None, **kwargs): if args.local_rank == 0: # master logger.info('receving weights from server') parameters_to_get = ctx.arbiter.get('sub_model_para') 
model = unwrap_model(model) model.load_submodel_weights(parameters_to_get) logger.info('received submodel weigths from the server') if args.world_size > 1: self._share_model(model, args) logger.info('sharing model parameters done') else: if args.world_size > 1: model = unwrap_model(model) self._share_model(model, args) logger.info('sharing model parameters done') def on_federation( self, ctx: Context, aggregator, fed_args: FedArguments, args: TrainingArguments, model: Optional[OffsiteTuningBaseModel] = None, optimizer: Optional[Optimizer] = None, scheduler: Optional[_LRScheduler] = None, dataloader: Optional[Tuple[DataLoader]] = None, control: Optional[TrainerControl] = None, state: Optional[TrainerState] = None, **kwargs, ): if self._aggregate_model: aggregator.model_aggregation(ctx, model) def on_train_end(self, ctx: Context, aggregator: Aggregator, fed_args: FedArguments, args: TrainingArguments, model: OffsiteTuningBaseModel = None, optimizer: Optimizer = None, scheduler: _LRScheduler = None, dataloader: Tuple[DataLoader]= None, control: TrainerControl= None, state: TrainerState = None, **kwargs): if args.local_rank == 0: if args.world_size > 1: model = unwrap_model(model) return_weights = model.get_submodel_weights(with_emulator=False) ctx.arbiter.put('trained_sub_model_para', return_weights) logger.info('weights sent back to the server') def init_aggregator(self, ctx: Context, fed_args: FedArguments): if self._aggregate_model: return super().init_aggregator(ctx, fed_args) else: return None class OffsiteTuningTrainerServer(Seq2SeqFedAVGServer): def __init__(self, ctx: Context, model: OffsiteTuningBaseModel, aggregate_model=False) -> None: self._aggregate_model = aggregate_model super().__init__(ctx, local_mode=False) assert isinstance(model, OffsiteTuningBaseModel), "model must be the subclass of OffsiteTuningBaseModel" self.model = model def on_train_begin(self, ctx: Context, aggregator: Aggregator): logger.info('sending weights to clients') parameters_to_send = self.model.get_submodel_weights() ctx.guest.put('sub_model_para', parameters_to_send) if any(p.role=='host' for p in ctx.parties): ctx.hosts.put('sub_model_para', parameters_to_send) def on_train_end(self, ctx: Context, aggregator: Aggregator): parameters_to_get = ctx.guest.get('trained_sub_model_para') self.model.load_submodel_weights(parameters_to_get, with_emulator=False) logger.info('received trained submodel weigths from the client') def on_federation(self, ctx: Context, aggregator, agg_iter_idx: int): if self._aggregate_model: aggregator.model_aggregation(ctx) else: logger.info('skip aggregation') def init_aggregator(self, ctx): if self._aggregate_model: return super().init_aggregator(ctx) else: return None def train(self): if self._aggregate_model: super().train() else: # do nothing but send the submodel weights to the client # and then aggregate the weights from the client self.on_init_end(self.ctx, aggregator=self.aggregator) self.on_train_begin(self.ctx, aggregator=self.aggregator) self.on_train_end(self.ctx, aggregator=self.aggregator) def save_model( self, output_dir: Optional[str] = None, state_dict=None ): import torch import os if not os.path.exists(output_dir): os.makedirs(output_dir) torch.save(self.model.state_dict(), output_dir + '/pytorch_model.bin') ================================================ FILE: python/fate_llm/algo/ppc-gpt/__init__.py ================================================ ================================================ FILE: python/fate_llm/data/__init__.py 
================================================
================================================
FILE: python/fate_llm/data/data_collator/__init__.py
================================================
#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


================================================
FILE: python/fate_llm/data/data_collator/cust_data_collator.py
================================================
#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from transformers.data import data_collator
from ..tokenizers.cust_tokenizer import get_tokenizer


def get_data_collator(data_collator_name, tokenizer_name_or_path=None,
                      pad_token=None, bos_token=None, eos_token=None,
                      pad_token_id=None, bos_token_id=None, eos_token_id=None,
                      trust_remote_code=False, **kwargs):
    # an unknown collator name should fail loudly instead of being returned
    if not hasattr(data_collator, data_collator_name):
        support_collator_list = list(
            filter(lambda module_name: "collator" in module_name.lower(), dir(data_collator)))
        raise ValueError(
            f"data_collator name={data_collator_name} is not in the supported list={support_collator_list}")

    tokenizer = get_tokenizer(tokenizer_name_or_path=tokenizer_name_or_path,
                              pad_token=pad_token,
                              bos_token=bos_token,
                              eos_token=eos_token,
                              pad_token_id=pad_token_id,
                              bos_token_id=bos_token_id,
                              eos_token_id=eos_token_id,
                              trust_remote_code=trust_remote_code)

    return getattr(data_collator, data_collator_name)(tokenizer, **kwargs)


def get_seq2seq_data_collator(tokenizer_name_or_path, **kwargs):
    return get_data_collator("DataCollatorForSeq2Seq", tokenizer_name_or_path=tokenizer_name_or_path, **kwargs)


================================================
FILE: python/fate_llm/data/data_collator/fedcot_collator.py
================================================
from transformers import DataCollatorForSeq2Seq
from transformers import AutoTokenizer
import pandas as pd


class PrefixDataCollator(DataCollatorForSeq2Seq):

    def __call__(self, features, return_tensors=None):
        # each feature holds a tokenized 'predict' pair and a tokenized 'rationale' pair;
        # collate the two groups separately so each task gets its own padded batch
        features_df = pd.DataFrame(features)
        cot = super().__call__(list(features_df['predict']), return_tensors)
        label = super().__call__(list(features_df['rationale']), return_tensors)
        return {
            'predict': cot,
            'rationale': label
        }


def get_prefix_data_collator(tokenizer_name_or_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
    data_collator = PrefixDataCollator(tokenizer)
    return data_collator


================================================
FILE: python/fate_llm/data/tokenizers/__init__.py
================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/data/tokenizers/cust_tokenizer.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import AutoTokenizer def get_tokenizer( tokenizer_name_or_path, trust_remote_code=False, padding_side=None, pad_token=None, bos_token=None, eos_token=None, pad_token_id=None, bos_token_id=None, eos_token_id=None, add_eos_token=True, ): tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, trust_remote_code=trust_remote_code, add_eos_token=add_eos_token ) if padding_side is not None: tokenizer.padding_side = padding_side if pad_token is not None: tokenizer.add_special_tokens({'pad_token': pad_token}) if bos_token is not None: tokenizer.add_special_tokens({'bos_token': bos_token}) if eos_token is not None: tokenizer.add_special_tokens({"eos_token": eos_token}) if pad_token_id is not None: tokenizer.pad_token_id = pad_token_id if bos_token_id is not None: tokenizer.bos_token_id = bos_token_id if eos_token_id is not None: tokenizer.eos_token_id = eos_token_id if "llama" in tokenizer_name_or_path.lower() or "gpt2" in tokenizer_name_or_path.lower(): tokenizer.pad_token = tokenizer.eos_token return tokenizer ================================================ FILE: python/fate_llm/dataset/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
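# --------------------------------------------------------------------------
# Minimal usage sketch for the helpers in cust_tokenizer.py and
# cust_data_collator.py above. Illustrative only: the "gpt2" checkpoint name
# and the toy texts are assumptions, not values used by the original repository.
if __name__ == "__main__":
    from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer
    from fate_llm.data.data_collator.cust_data_collator import get_seq2seq_data_collator

    # get_tokenizer sets pad_token = eos_token automatically for gpt2/llama checkpoints
    tokenizer = get_tokenizer("gpt2")
    # DataCollatorForSeq2Seq pads input_ids and labels of each feature to the batch max length
    collator = get_seq2seq_data_collator("gpt2", padding=True)
    features = [
        {"input_ids": tokenizer("hello world")["input_ids"], "labels": tokenizer(" hi")["input_ids"]},
        {"input_ids": tokenizer("a longer example sentence")["input_ids"], "labels": tokenizer(" ok")["input_ids"]},
    ]
    batch = collator(features)
    print(batch["input_ids"].shape, batch["labels"].shape)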
# ================================================ FILE: python/fate_llm/dataset/data_config/__init__.py ================================================ import os # absolute path to current directory parent_dir = os.path.dirname(os.path.realpath(__file__)) DATA_CONFIG_TEMPLATE = {"ag_news": os.path.join(parent_dir, "default_ag_news.yaml"), "yelp_review": os.path.join(parent_dir, "default_yelp_review.yaml"),} ================================================ FILE: python/fate_llm/dataset/data_config/default_ag_news.yaml ================================================ dataset_kwargs: data_files: ag_news_review/AGnews/train.json dataset_path: json doc_to_target: '{{label}}' metric_list: - aggregation: mean higher_is_better: true metric: accuracy output_type: generate_until task: ag-news validation_split: train label_key: label text_key: text sub_domain: AGnews few_shot_num_per_label: 2 tokenize_format: "Product type: {{sub_domain}} | Text Category: {{label}}" few_shot_format: "- : {{label}}.\n- : {{text}}\n\n" augment_format: "The news' topics belong to the following 4 categories: 0.world 1.sports 2.business 3.science and technology. Please generate news according to the following format, bearing in mind that the generated results should not resemble the examples, but should align with the specified category: \n" text_with_label_format: "******\n {{i}}.\nNews: {{text}}\nCategory: {{label}}.\n" filter_format: "I will give you some news samples with their categories, The news' topics belong to the following 4 categories: 0.world 1.sports 2.business 3.science and technology. the samples are delimited by '******':\n {text_with_label} Please filter out texts that are ambiguous, do not belong to news or do not meet the categories, and leave news texts that meet the categories.\n You should also filter out news text that are too similar to other samples and keep the most representative ones. Your answer should begin with 'The eligible samples:\n\n' and the indexes of the texts you choose, use spaces to separate the indexes and do not provide duplicate indices or indices that exceed the maximum index of samples." label_list: - 'world' - 'sports' - 'business' - 'science and technology' ================================================ FILE: python/fate_llm/dataset/data_config/default_yelp_review.yaml ================================================ dataset_kwargs: data_files: yelp_review/Health/train.json dataset_path: json doc_to_target: '{{label}}' metric_list: - aggregation: mean higher_is_better: true metric: accuracy output_type: generate_until task: yelp-review label_key: stars text_key: text validation_split: train sub_domain: Health few_shot_num_per_label: 2 tokenize_format: "Product type: {{sub_domain}} | Review Score: {{label}}" text_with_label_format: "******\n {{i}}.\nReview: {{text}}\nRating stars: {{label}}.\n" few_shot_format: "******\n- : {{label}} stars.\n- : {{text}}\n\n" augment_format: "The reviews are rated from 1 to 5 stars, with 1 being the worst, 3 being neutral and 5 being the best. Please generate more similar samples for each rating star about the Health domain as shown in the following format, bearing in mind that the generated results should not copy or resemble the examples, and should align with the {{sub_domain}} domain and the rating stars.\nThe examples are delimited by '******'." 
filter_format: "I will give you some customer review text samples with their rating stars, these samples are indexed starting from 0, the samples are delimited by '******':\n {{text_with_label}}. These reviews gradually shift from negative to positive from 1 star to 5 stars. 1 star represents the worst, 2 stars are better than 1 star, but still indicate a negative review. 3 stars represent a neutral review. 4 stars indicate a positive review, but less positive than 5 stars. 5 stars represent perfection.\n Please filter out text that does not belong to customer reviews or does not meet the rating stars, and leave review texts that meet the labels.\n You should also filter out text that are too similar to other samples and keep the most representative ones. Your answer should begin with 'The eligible samples:\n\n' and the indexes of the texts you choose, use spaces to separate the indexes and do not provide duplicate indices or indices that exceed the maximum index of samples." label_list: - 1 - 2 - 3 - 4 - 5 ================================================ FILE: python/fate_llm/dataset/fedcot_dataset.py ================================================ from fate_llm.dataset.input_output_dataset import InputOutputDataset from transformers.trainer_pt_utils import LabelSmoother from typing import List, Dict, Union, Literal import logging from jinja2 import Template from transformers import AutoTokenizer logger = logging.getLogger(__name__) class PrefixDataset(InputOutputDataset): def __init__(self, tokenizer_path, predict_input_template: str, predict_output_template: str, rationale_input_template: str, rationale_output_template: str, max_input_length: int = 256, max_target_length: int = 256, load_from: Literal['jsonl', 'hf_load_from_disk', 'hf_load_dataset'] = 'hf_load_from_disk', split_key: str = None ): super().__init__(tokenizer_path, predict_input_template, predict_output_template, max_input_length, max_target_length, load_from, split_key) self.r_input_template = Template(rationale_input_template) self.r_output_template = Template(rationale_output_template) def load_rationale(self, result_list, key='rationale'): for d, r in zip(self.dataset, result_list): d[key] = r def get_str_item(self, i) -> dict: data_item = self.dataset[i] p_in = self.input_template.render(data_item) p_out = self.output_template.render(data_item) r_in = self.r_input_template.render(data_item) r_out = self.r_output_template.render(data_item) ret_dict = { 'predict':{ 'input': p_in, 'output': p_out }, 'rationale':{ 'input': r_in, 'output': r_out } } return ret_dict def get_tokenized_item(self, i) -> dict: str_item = self.get_str_item(i) ret_dict = { 'predict': self._process_item(str_item['predict']), 'rationale': self._process_item(str_item['rationale']) } return ret_dict ================================================ FILE: python/fate_llm/dataset/flex_dataset.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import logging import pickle import re from datasets import load_dataset from fastchat.model import get_conversation_template from jinja2 import Template from ruamel import yaml from transformers import AutoTokenizer from typing import Union, Literal from fate.ml.nn.dataset.base import Dataset from fate_llm.dataset.data_config import DATA_CONFIG_TEMPLATE logger = logging.getLogger(__name__) """ Implementation of FDKT augmentation process, adopted from https://arxiv.org/abs/2405.14212 """ def get_jinjax_placeholders(jinjax_text, placeholder_count=2): pattern = r"<([^>]+)>" matches = re.findall(pattern, jinjax_text) return matches[:placeholder_count] def regex_replace(string, pattern, repl, count: int = 0): """ adopted from lm-evaluation-harness/lm-eval/utils.py for offline use Parameters ---------- string pattern repl count Returns ------- """ return re.sub(pattern, repl, string, count=count) def apply_template(template, data): """ adopted from lm-evaluation-harness/lm-eval/utils.py for offline use Parameters ---------- template data Returns ------- """ return Template(template).render(data) def tokenize_flex_dataset(raw_datasets, tokenizer, sub_domain, tokenize_format, text_key, label_key, data_part="train", save_path=None, max_prompt_len=256): tokenizer.pad_token = tokenizer.eos_token column_names = raw_datasets[data_part].column_names def tokenize_function(examples): texts = tokenizer(examples[text_key]) label_processed = [apply_template(tokenize_format,{"sub_domain": sub_domain,"label": label}) for label in examples[label_key]] labels = tokenizer(label_processed) input_ids = [i2 + i1 for i1, i2 in zip(texts['input_ids'], labels['input_ids'])] attention_mask = [i2 + i1 for i1, i2 in zip(texts['attention_mask'], labels['attention_mask'])] """ cut off max prompt length """ input_ids = [t[: max_prompt_len] for t in input_ids] attention_mask = [t[: max_prompt_len] for t in attention_mask] out = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids} return out tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=4, remove_columns=column_names, desc="Running tokenizer on dataset", ) if save_path is not None: tokenized_datasets.save_to_disk(save_path) return tokenized_datasets class FlexDataset(Dataset): def __init__(self, tokenizer_name_or_path, dataset_name: str, load_from: Literal['json'] = 'json', data_part: str = None, config: Union[dict, str] = None, need_preprocess: bool = True, random_state: int = None, max_prompt_len: int = 256, select_num: int = None, few_shot_num_per_label: int = None ): super().__init__() self.tokenizer = None self.tokenizer_name_or_path = tokenizer_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name_or_path, trust_remote_code=True) self.dataset_name = dataset_name if self.dataset_name and config is None: config = DATA_CONFIG_TEMPLATE.get(self.dataset_name, "") self.load_from = load_from self.data_part = data_part self.random_state = random_state self.need_preprocess = need_preprocess self.max_prompt_len = max_prompt_len self.select_num = select_num self.dataset = None self.ds = None self.label_key = None self.text_key = None self.augment_format = None self.filter_format = None self.few_shot_format = None self.tokenize_format = None self.sub_domain = None self.label_list = None self.text_with_label_format = None self.few_shot_num_per_label = few_shot_num_per_label self.config = config if isinstance(config, str): with open(config, 'r') as f: self.config = yaml.safe_load(f) 
self.parse_config() def parse_config(self, config=None): if config is None: config = self.config self.label_key = config.get("label_key", None) self.text_key = config.get("text_key", None) self.augment_format = config.get("augment_format", None) self.filter_format = config.get("filter_format", None) self.tokenize_format = config.get("tokenize_format", None) self.sub_domain = config.get("sub_domain", None) self.label_list = config.get("label_list", None) self.few_shot_format = config.get("few_shot_format", None) self.text_with_label_format = config.get("text_with_label_format", None) if self.few_shot_num_per_label is None: self.few_shot_num_per_label = config.get("few_shot_num_per_label", 2) def get_generate_prompt(self, tokenize=True, return_tensors="pt"): prompt_list = [apply_template(self.tokenize_format, {"sub_domain": self.sub_domain, "label": label}) for label in self.label_list] if tokenize: tokenized_prompts = self.tokenizer(prompt_list, return_tensors=return_tensors) prompt_list = tokenized_prompts['input_ids'] return {label: prompt for label, prompt in zip(self.label_list, prompt_list)} @staticmethod def construct_prompt_list(samples_dict, num_shot_per_label, prompt_num, format_template, random_state=None): from sklearn.utils import resample from collections import deque label_samples = {label: deque(resample(samples, replace=False, n_samples=len(samples))) for label, samples in samples_dict.items()} def get_samples_for_label(label): samples = [] while len(samples) < num_shot_per_label: remaining_needed = num_shot_per_label - len(samples) if len(label_samples[label]) < remaining_needed: batch_samples = list(label_samples[label]) samples.extend(batch_samples) # reset to allow repetition label_samples[label] = deque(resample(samples_dict[label], replace=False, n_samples=len(samples_dict[label]))) else: batch_samples = [label_samples[label].popleft() for _ in range(remaining_needed)] samples.extend(batch_samples) return samples result = [] for _ in range(prompt_num): prompt = '' for label in samples_dict.keys(): samples = get_samples_for_label(label) for text in samples: prompt += apply_template(format_template, {"text": text, "label": label}) result.append(prompt) return result @staticmethod def group_text_label_list(text_list, label_list): group_data = [{"text": text, "label": label} for text, label in zip(text_list, label_list)] return group_data def prepare_few_shot(self, text_list, label_list, aug_prompt_num): from collections import defaultdict data_dict = defaultdict(list) for text, label in zip(text_list, label_list): # in case extra labels are present, ignore if label in self.label_list: data_dict[label].append(text) few_shot_list = FlexDataset.construct_prompt_list(samples_dict=data_dict, num_shot_per_label=self.few_shot_num_per_label, prompt_num=aug_prompt_num, format_template=self.few_shot_format, random_state=self.random_state) return few_shot_list def prepare_augment(self, text_list, label_list, aug_prompt_num): few_shot_samples = self.prepare_few_shot(text_list, label_list, aug_prompt_num) result = [] instruction = apply_template(self.augment_format, {"sub_domain": self.sub_domain}) for i, sample in enumerate(few_shot_samples): query = instruction + '\n' + sample formatted_query = self.apply_chat_template(query) result.append(formatted_query) return result def abstract_from_augmented(self, sample_list): label_key, text_key = get_jinjax_placeholders(self.few_shot_format, 2) res = {'inputs': [], 'labels': []} for sample in sample_list: data_list = sample.split('\n\n-') 
for entry in data_list: temp = entry.split(f"<{text_key}>:") # print(f"temp: {temp}") if len(temp) == 2 and f"<{label_key}>" in temp[0]: label_str, input_str = temp label = label_str.split(f"<{label_key}>:")[1].strip() if isinstance(self.label_list[0], int) and label[0].isdigit(): label = int(label[0]) elif isinstance(self.label_list[0], float) and re.match(r'^\d+\.\d*?$', label): label = float(label[0]) # abstracted label value does not match the original label type elif isinstance(self.label_list[0], int) or isinstance(self.label_list[0], float): continue text = input_str.replace('
', '').rstrip('*') text = text.strip() res['inputs'].append(text) res['labels'].append(label) # print(f"res: {res}") return res def prepare_query_to_filter_clustered(self, clustered_sentences_list, clustered_labels_list): prompt_list = [] for clustered_sentences, clustered_labels in zip(clustered_sentences_list, clustered_labels_list): text_with_label = '' for i in range(len(clustered_sentences)): formatted_entry = apply_template(self.text_with_label_format, {"i": i, "text": clustered_sentences[i], "label": clustered_labels[i]}) text_with_label += formatted_entry cluster_query = apply_template(self.filter_format, {"text_with_label": text_with_label}) prompt_list.append(self.apply_chat_template(cluster_query)) return prompt_list def parse_clustered_response(self, clustered_sentence, clustered_labels, response_list): """ Parse the response from the clustering model and filter the data per cluster. :param clustered_sentence: nested list of clustered sentences :param clustered_labels: nested list of clustered labels :param response_list: list of responses from the clustering model """ def parse_response(response): pattern = r'The eligible samples:\s*((?:\b\d+\b[\s.,]*)+)' matches = re.search(pattern, response, re.MULTILINE) if matches: digits = [int(i) for i in re.findall(r'\b\d+\b', matches.group())] else: digits = [] return list(set(digits)) filtered_text_list = [] filtered_label_list = [] for i in range(len(clustered_sentence)): parsed_response = parse_response(response_list[i]) for idx in parsed_response: if idx < len(clustered_sentence[i]): filtered_label_list.append(clustered_labels[i][idx]) filtered_text_list.append(clustered_sentence[i][idx]) return filtered_text_list, filtered_label_list @staticmethod def group_data_list(data_list, text_key, label_key): inputs = [entry[text_key] for entry in data_list] labels = [entry[label_key] for entry in data_list] data_dict = {text_key: inputs, label_key: labels} return data_dict def load(self, path): local_data = load_dataset('json', data_files={self.data_part: path}) self.dataset = local_data if not self.need_preprocess: self.ds = local_data else: tokenized_ds = tokenize_flex_dataset( raw_datasets=local_data, tokenizer=self.tokenizer, sub_domain=self.sub_domain, tokenize_format=self.tokenize_format, text_key=self.text_key, label_key=self.label_key, max_prompt_len=self.max_prompt_len ) self.ds = tokenized_ds[self.data_part] if self.select_num is not None: self.ds = self.ds.select(range(self.select_num)) def apply_chat_template(self, query): tokenizer = self.tokenizer if "llama-3" in self.tokenizer_name_or_path.lower(): msg = [ {"role": "system", "content": "You are a helpful assistant. "}, {"role": "user", "content": query} ] prompt = tokenizer.apply_chat_template(msg, add_generation_prompt=True, tokenize=False) else: conv = get_conversation_template(self.tokenizer_name_or_path) conv.append_message(conv.roles[0], query) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() return prompt def get_raw_dataset(self): return self.dataset def __len__(self): return len(self.ds) def get_item(self, i): return self.dataset[self.data_part][i] def get_item_dict(self, i): return {"text": self.dataset[self.data_part][self.text_key][i], "label": self.dataset[self.data_part][self.label_key][i]} def __getitem__(self, i) -> dict: return self.ds[i] ================================================ FILE: python/fate_llm/dataset/hf_dataset.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. 
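# --------------------------------------------------------------------------
# Minimal sketch of the FDKT augmentation flow implemented by FlexDataset
# above. Illustrative only: the tokenizer path and the toy samples are
# assumptions; the large-model generation step itself happens elsewhere.
if __name__ == "__main__":
    from fate_llm.dataset.flex_dataset import FlexDataset

    # dataset_name="ag_news" resolves default_ag_news.yaml from data_config above
    ds = FlexDataset(tokenizer_name_or_path="/path/to/local/llm",
                     dataset_name="ag_news", data_part="train")

    # toy private samples; labels must come from the config's label_list
    texts = ["Stocks rallied after the earnings report.",
             "The home team clinched the title last night."]
    labels = ["business", "sports"]

    # few-shot augmentation prompts the client would send to the large model;
    # the raw generations are later parsed back via abstract_from_augmented(...)
    prompts = ds.prepare_augment(texts, labels, aug_prompt_num=2)
    for p in prompts:
        print(p)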
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os from typing import Optional, Union, Sequence, Mapping, Dict from datasets import load_dataset, Features, Split, DownloadConfig, DownloadMode, VerificationMode, Version, load_from_disk from transformers import AutoTokenizer from fate.ml.nn.dataset.base import Dataset # avoid tokenizer parallelism os.environ["TOKENIZERS_PARALLELISM"] = "false" class HuggingfaceDataset(Dataset): """ A dataset class for huggingface datasets """ def __init__( self, name: Optional[str] = None, data_dir: Optional[str] = None, data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, split: Optional[Union[str, Split]] = None, cache_dir: Optional[str] = None, features: Optional[Features] = None, download_config: Optional[DownloadConfig] = None, download_mode: Optional[Union[DownloadMode, str]] = None, verification_mode: Optional[Union[VerificationMode, str]] = None, ignore_verifications="deprecated", keep_in_memory: Optional[bool] = None, save_infos: bool = False, revision: Optional[Union[str, Version]] = None, token: Optional[Union[bool, str]] = None, use_auth_token="deprecated", task="deprecated", streaming: bool = False, num_proc: Optional[int] = None, storage_options: Optional[Dict] = None, trust_remote_code: bool = None, tokenizer_params: Optional[Dict] = None, tokenizer_apply_params: Optional[Dict] = None, load_from_disk: Optional[bool] = False, inplace_load: Optional[bool] = True, data_split_key: Optional[str] = None, **config_kwargs, ): self.name = name self.data_dir = data_dir self.data_files = data_files self.split = split self.cache_dir = cache_dir self.features = features self.download_config = download_config self.download_mode = download_mode self.verification_mode = verification_mode self.ignore_verifications = ignore_verifications self.keep_in_memory = keep_in_memory self.save_infos = save_infos self.revision = revision self.token = token self.use_auth_token = use_auth_token self.task = task self.streaming = streaming self.num_proc = num_proc self.storage_options = storage_options self.trust_remote_code = trust_remote_code self.tokenizer_params = tokenizer_params self.tokenizer_apply_params = tokenizer_apply_params self.config_kwargs = config_kwargs self.load_from_disk = load_from_disk self.inplace_load = inplace_load self.data_split_key = data_split_key self.ds = None super(HuggingfaceDataset, self).__init__() def load(self, file_path): if not self.load_from_disk: ds = load_dataset(path=file_path, name=self.name, data_dir=self.data_dir, data_files=self.data_files, split=self.split, cache_dir=self.cache_dir, features=self.features, download_config=self.download_config, download_mode=self.download_mode, verification_mode=self.verification_mode, ignore_verifications=self.ignore_verifications, keep_in_memory=self.keep_in_memory, save_infos=self.save_infos, revision=self.revision, token=self.token, use_auth_token=self.use_auth_token, task=self.task, streaming=self.streaming, num_proc=self.num_proc, 
storage_options=self.storage_options, trust_remote_code=self.trust_remote_code, **self.config_kwargs) else: ds = load_from_disk(file_path) if self.data_split_key is not None: ds = ds[self.data_split_key] if self.inplace_load: self.ds = ds else: return ds def __getitem__(self, idx): if self.ds is None: raise ValueError('Dataset is not loaded') return self.ds[idx] def __len__(self): if self.ds is None: raise ValueError('Dataset is not loaded') return len(self.ds) class Dolly15K(HuggingfaceDataset): INSTRUCTION_KEY = "### Instruction:" INPUT_KEY = "Input:" RESPONSE_KEY = "### Response:" END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" DEFAULT_SEED = 42 INTRO_BLURB = ( "Below is an instruction that describes a task. Write a response that appropriately completes the request." ) PROMPT_NO_INPUT_FORMAT = """{intro} {instruction_key} {instruction} {response_key} {response} {end_key}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY, response="{response}", end_key=END_KEY, ) # This is a training prompt that contains an input string that serves as context for the instruction. For example, # the input might be a passage from Wikipedia and the intruction is to extract some information from it. PROMPT_WITH_INPUT_FORMAT = """{intro} {instruction_key} {instruction} {input_key} {input} {response_key} {response} {end_key}""".format( intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", input_key=INPUT_KEY, input="{input}", response_key=RESPONSE_KEY, response="{response}", end_key=END_KEY, ) def __init__(self, *args, **kwargs): super(Dolly15K, self).__init__(*args, **kwargs) self.inplace_load = False def load(self, file_path): dataset = super().load(file_path) return self._post_process(dataset) def _post_process(self, dataset): def _add_text(rec): instruction = rec["instruction"] response = rec["response"] context = rec.get("context") if not instruction: raise ValueError(f"Expected an instruction in: {rec}") if not response: raise ValueError(f"Expected a response in: {rec}") # For some instructions there is an input that goes along with the instruction, providing context for the # instruction. For example, the input might be a passage from Wikipedia and the instruction says to extract # some piece of information from it. The response is that information to extract. In other cases there is # no input. For example, the instruction might be open QA such as asking what year some historic figure was # born. 
if context: rec["text"] = self.PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, response=response, input=context) else: rec["text"] = self.PROMPT_NO_INPUT_FORMAT.format(instruction=instruction, response=response) return rec dataset = dataset.map(_add_text) tokenizer = AutoTokenizer.from_pretrained(**self.tokenizer_params) def tokenize_function(examples): return tokenizer(examples["text"], **self.tokenizer_apply_params) dataset = dataset.map(tokenize_function, batched=True) return dataset ================================================ FILE: python/fate_llm/dataset/input_output_dataset.py ================================================ from fate.ml.nn.dataset.base import Dataset from transformers.trainer_pt_utils import LabelSmoother from typing import List, Dict, Union, Literal import logging from jinja2 import Template from transformers import AutoTokenizer logger = logging.getLogger(__name__) class InputOutputDataset(Dataset): def __init__(self, tokenizer_path, input_template: str, output_template: str, max_input_length: int = 256, max_target_length: int = 256, load_from: Literal['jsonl', 'hf_load_from_disk', 'hf_load_dataset'] = 'hf_load_from_disk', split_key: str = None ): super().__init__() self.tokenizer = None self.tokenizer_path = tokenizer_path self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path, trust_remote_code=True) self.max_input_length = max_input_length self.max_target_length = max_target_length self.dataset = None self.load_from = load_from self.input_template = Template(input_template) self.output_template = Template(output_template) self.split_key = split_key self.max_seq_length = max_input_length + max_target_length + 1 def load(self, path): if self.load_from == 'hf_load_from_disk': import datasets self.dataset = datasets.load_from_disk(path) if self.split_key is not None: self.dataset = self.dataset[self.split_key] self.dataset = [i for i in self.dataset] elif self.load_from == 'jsonl': import json with open(path, 'r') as f: json_lines = f.read().split('\n') self.dataset = [] for i in json_lines: try: self.dataset.append(json.loads(i)) except: print('skip line') elif self.load_from == 'hf_load_dataset': from datasets import load_dataset self.dataset = load_dataset(path) if self.split_key is not None: self.dataset = self.dataset[self.split_key] self.dataset = [i for i in self.dataset] else: raise ValueError('unknown load format') if not isinstance(self.dataset, list) or not isinstance(self.dataset[0], dict): logger.warn('loaded dataset is expected to be a list of dict') def get_raw_dataset(self): return self.dataset def __len__(self): return len(self.dataset) def get_str_item(self, i) -> dict: data_item = self.dataset[i] in_ = self.input_template.render(**data_item) out_ = self.output_template.render(**data_item) return { 'input': in_, 'output': out_ } def _process_item(self, data_item): a_ids = self.tokenizer.encode(text=data_item['input'], add_special_tokens=True, truncation=True, max_length=self.max_input_length) b_ids = self.tokenizer.encode(text=data_item['output'], add_special_tokens=False, truncation=True, max_length=self.max_target_length) context_length = len(a_ids) input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id] labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id] pad_len = self.max_seq_length - len(input_ids) input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len labels = labels + [self.tokenizer.pad_token_id] * pad_len labels = [(l if l != self.tokenizer.pad_token_id else 
-100) for l in labels] assert len(input_ids) == len(labels), f"length mismatch: {len(input_ids)} vs {len(labels)}" return { "input_ids": input_ids, "labels": labels } def get_tokenized_item(self, i) -> dict: str_item = self.get_str_item(i) ret_dict = self._process_item(str_item) return ret_dict def __getitem__(self, i) -> dict: item = self.get_tokenized_item(i) return item ================================================ FILE: python/fate_llm/dataset/prompt_dataset.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import copy import json import datasets import torch from fate.ml.nn.dataset.base import Dataset from ..data.tokenizers.cust_tokenizer import get_tokenizer PROMPT_TEMPLATE = "{prompt}" class PromptDataset(Dataset): def __init__(self, text_max_length=512, tokenizer_name_or_path=None, trust_remote_code=False, padding=False, padding_side='left', pad_token=None, pad_token_id=None, bos_token_id=None, eos_token_id=None, add_eos_token=True, prompt_template=None, add_special_tokens=False, prompt_column="content", response_column="summary", max_prompt_length=256, file_type="jsonl", num_proc=4, ): super(PromptDataset, self).__init__() self.tokenizer = None self.tokenizer_name_or_path = tokenizer_name_or_path self.padding = padding self.add_special_tokens = add_special_tokens self.max_prompt_length = max_prompt_length self.text_max_length = text_max_length self.tokenizer = get_tokenizer( tokenizer_name_or_path=tokenizer_name_or_path, trust_remote_code=trust_remote_code, pad_token=pad_token, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, padding_side=padding_side, add_eos_token=add_eos_token, ) self.prompt_template = prompt_template if prompt_template else PROMPT_TEMPLATE self.prompt_column = prompt_column self.response_column = response_column self.file_type = file_type self.num_proc = num_proc self._data = None def load(self, file_path): if "jsonl" in self.file_type: prompts = [] responses = [] with open(file_path, "r") as fin: for line in fin: line = json.loads(line) prompts.append(line[self.prompt_column]) responses.append(line[self.response_column]) ds = datasets.Dataset.from_dict({self.prompt_column: prompts, self.response_column: responses}) else: ds = datasets.load_from_disk(file_path) self._data = ds.map( self._process_data, fn_kwargs={"tokenizer": self.tokenizer, "prompt_template": self.prompt_template, "prompt_column": self.prompt_column, "response_column": self.response_column, "max_prompt_length": self.max_prompt_length, "max_length": self.text_max_length }, batched=True, remove_columns=ds.column_names, num_proc=self.num_proc, ) max_length = None for d in self._data: if max_length is None: max_length = len(d["input_ids"]) else: max_length = max(max_length, len(d["input_ids"])) self._data = self._data.map( self._pad_to_max_length, batched=True, fn_kwargs={ "tokenizer": self.tokenizer, "max_length": max_length }, num_proc=self.num_proc ) @staticmethod def 
_process_data(examples, tokenizer, prompt_template, prompt_column, response_column, max_prompt_length, max_length): prompts = examples[prompt_column] responses = examples[response_column] processed_data = dict() input_ids_list = [] labels_list = [] attention_mask_list = [] for _prompt, _response in zip(prompts, responses): if isinstance(_response, list): _response = _response[0] _prompt = prompt_template.format_map(dict(prompt=_prompt)) prompt_encoded = tokenizer(_prompt) if len(prompt_encoded['input_ids']) > 0 and prompt_encoded['input_ids'][-1] in tokenizer.all_special_ids: prompt_encoded['input_ids'] = prompt_encoded['input_ids'][:-1] prompt_encoded['attention_mask'] = prompt_encoded['attention_mask'][:-1] target_encoded = tokenizer(_response) if len(target_encoded['input_ids']) > 0 and target_encoded['input_ids'][-1] in tokenizer.all_special_ids: target_encoded['input_ids'] = target_encoded['input_ids'][:-1] target_encoded['attention_mask'] = target_encoded['attention_mask'][:-1] prompt_ids = prompt_encoded["input_ids"][: max_prompt_length] prompt_attention_mask = prompt_encoded["attention_mask"][:max_prompt_length] target_ids = target_encoded["input_ids"][: max_length - len(prompt_ids) - 1] target_attention_mask = target_encoded["attention_mask"][: max_length - len(prompt_ids) - 1] if tokenizer.bos_token_id is not None: seq_length = len(prompt_ids) + 1 input_ids = prompt_ids + [tokenizer.bos_token_id] + target_ids + [tokenizer.eos_token_id] labels = [-100] * seq_length + input_ids[seq_length:] attention_mask = prompt_attention_mask + [1] + target_attention_mask + [1] else: seq_length = len(prompt_ids) input_ids = prompt_ids + target_ids + [tokenizer.eos_token_id] labels = [-100] * seq_length + input_ids[seq_length:] attention_mask = prompt_attention_mask + target_attention_mask + [1] input_ids_list.append(input_ids) labels_list.append(labels) attention_mask_list.append(attention_mask) processed_data["labels"] = labels_list processed_data["input_ids"] = input_ids_list processed_data["attention_mask"] = attention_mask_list return processed_data @staticmethod def _pad_to_max_length(examples, tokenizer, max_length): padded_input_ids = [] padded_labels = [] padded_attention_mask = [] labels_list = examples["labels"] input_ids_list = examples["input_ids"] attention_mask_list = examples["attention_mask"] for input_ids, attention_mask, labels in zip(input_ids_list, attention_mask_list, labels_list): l = len(input_ids) input_ids = torch.LongTensor(input_ids + [tokenizer.pad_token_id] * (max_length - l)) labels = torch.LongTensor(labels + [-100] * (max_length - l)) attention_mask = torch.LongTensor(attention_mask + [0] * (max_length - l)) padded_input_ids.append(input_ids) padded_labels.append(labels) padded_attention_mask.append(attention_mask) return dict( input_ids=padded_input_ids, attention_mask=padded_attention_mask, labels=padded_labels ) def get_vocab_size(self): return self.tokenizer.vocab_size def __getitem__(self, item): return self._data[item] def __len__(self): return len(self._data) def __repr__(self): return self.tokenizer.__repr__() ================================================ FILE: python/fate_llm/dataset/qa_dataset.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
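# --------------------------------------------------------------------------
# Minimal usage sketch for PromptDataset above. Illustrative only: the
# tokenizer path, the temporary file name and the sample contents are
# assumptions made for the example.
if __name__ == "__main__":
    import json
    from fate_llm.dataset.prompt_dataset import PromptDataset

    # PromptDataset expects one {"content": ..., "summary": ...} object per line
    samples = [
        {"content": "Summarize: federated learning lets parties train jointly.",
         "summary": "Joint training without sharing raw data."},
        {"content": "Summarize: LoRA adapts a frozen model with low-rank updates.",
         "summary": "Low-rank adapters for efficient fine-tuning."},
    ]
    with open("/tmp/prompt_train.jsonl", "w") as f:
        for s in samples:
            f.write(json.dumps(s) + "\n")

    ds = PromptDataset(tokenizer_name_or_path="/path/to/local/tokenizer",
                       prompt_column="content", response_column="summary")
    ds.load("/tmp/prompt_train.jsonl")
    # prompt tokens are masked to -100 in labels; all items are padded to the dataset max length
    print(ds[0]["input_ids"], ds[0]["labels"])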
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from datasets import load_from_disk, load_dataset from transformers import AutoTokenizer from fate.ml.nn.dataset.base import Dataset """ These Data pre-processing templates are from https://github.com/mit-han-lab/offsite-tuning """ class PIQA: def __init__(self): self._template = "Question: {}\nAnswer:" def get_context(self, examples): ctx = examples['goal'] return [self._template.format(c) for c in ctx] def get_target(self, examples): if -1 in examples["label"]: # test set return [""] * len(examples["label"]) else: gt_tuples = [("sol{}".format(label + 1), idx) for idx, label in enumerate(examples['label'])] return [examples[k][i] for k, i in gt_tuples] class SciQ: def __init__(self): self._template = "{}\nQuestion: {}\nAnswer:" def get_context(self, examples): sources = examples['support'] queries = examples['question'] return [self._template.format(s, q) for s, q in zip(sources, queries)] def get_target(self, examples): return examples['correct_answer'] class OpenBookQA: def get_context(self, examples): return examples['question_stem'] def get_target(self, examples): choices = examples['choices'] answers = examples['answerKey'] targets = [] for choice, answer in zip(choices, answers): answer = ord(answer.strip()) - ord('A') targets.append(choice['text'][answer]) return targets class ARC: def __init__(self): self._template = "Question: {}\nAnswer:" def get_context(self, examples): ctx = examples['question'] return [self._template.format(c) for c in ctx] def get_target(self, examples): choices = examples['choices'] answers = examples['answerKey'] num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"} for idx, answer in enumerate(answers): answer = num_to_letter.get(answer, answer) answer = ord(answer) - ord("A") answers[idx] = choices[idx]["text"][answer] return answers class WIC: def __init__(self): self._template = "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \ " two sentences above?\nAnswer:" def get_context(self, examples): sentences_1 = examples["sentence1"] sentences_2 = examples["sentence2"] starts_1 = examples["start1"] ends_1 = examples["end1"] contexts = [] for s1, s2, st, ed in zip(sentences_1, sentences_2, starts_1, ends_1): contexts.append( self._template.format(s1, s2, s1[st: ed]) ) return contexts def get_target(self, examples): labels = examples["label"] targets = [] for label in labels: targets.append(" {}".format({0: "no", 1: "yes"}[label])) return targets class BoolQ: def __init__(self): self._template = "{}\nQuestion: {}?\nAnswer:" def get_context(self, examples): passages = examples["passage"] questions = examples["question"] return [self._template.format(passage, question) for passage, question in zip(passages, questions) ] def get_target(self, examples): return [" " + "yes" if label else "no" for label in examples["answer"]] class CommonsenseQA: def get_context(self, examples): return examples["question"] def get_target(self, examples): choices = examples['choices'] answers = examples['answerKey'] targets = [] for choice, answer in zip(choices, answers): answer = ord(answer.strip()) - ord('A') 
targets.append(choice['text'][answer]) return targets class RTE: def __init__(self): self._template = "{}\nQuestion: {} True or False?\nAnswer:" def get_context(self, examples): sentences_1 = examples["premise"] sentences_2 = examples["hypothesis"] contexts = [] for sentence_1, sentence_2 in zip(sentences_1, sentences_2): contexts.append( self._template.format(sentence_1, sentence_2) ) return contexts def get_target(self, examples): labels = examples["label"] return [" {}".format({0: "True", 1: "False"}[label]) for label in labels] task_dict = { "piqa": PIQA(), "sciq": SciQ(), "openbookqa": OpenBookQA(), "arc_easy": ARC(), "arc_challenge": ARC(), "wic": WIC(), "boolq": BoolQ(), "commonsenseqa": CommonsenseQA(), "rte": RTE() } def tokenize_qa_dataset(dataset_name, tokenizer, save_path=None, seq_max_len=1000, data_part="train", dataset=None): max_len = seq_max_len assert dataset_name in task_dict.keys(), f"dataset name must be one of {list(task_dict.keys())}" if dataset is None: raw_datasets = load_dataset(dataset_name) else: raw_datasets = dataset task = task_dict[dataset_name] column_names = raw_datasets[data_part].column_names def tokenize_function(examples): context = task.get_context(examples) target = task.get_target(examples) context = tokenizer(context) target = tokenizer(target) # if context is ending with special token, remove it if len(context['input_ids'][0]) > 0 and context['input_ids'][0][-1] in tokenizer.all_special_ids: context['input_ids'] = [i[:-1] for i in context['input_ids']] context['attention_mask'] = [a[:-1] for a in context['attention_mask']] # if target is starting with special token, remove it if len(target['input_ids'][0]) > 0 and target['input_ids'][0][0] in tokenizer.all_special_ids: target['input_ids'] = [i[1:] for i in target['input_ids']] target['attention_mask'] = [a[1:] for a in target['attention_mask']] out = {} out['input_ids'] = [i1 + i2 for i1, i2 in zip(context['input_ids'], target['input_ids'])] out['attention_mask'] = [a1 + a2 for a1, a2 in zip(context['attention_mask'], target['attention_mask'])] # set -100 for context tokens out["labels"] = [ [-100] * len(i1) + i2 for i1, i2 in zip(context['input_ids'], target['input_ids'])] return out tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=4, remove_columns=column_names, load_from_cache_file=True, desc="Running tokenizer on dataset", ) # pad all instances in lm_datasets to the max length of the dataset max_length = -1 for v in tokenized_datasets.values(): for x in v: max_length = max(max_length, len(x['input_ids'])) # pad to the multiple of 8 max_length = (max_length // 8 + 1) * 8 block_size = max_len max_length = min(max_length, block_size) def pad_function(examples): examples["input_ids"] = [i + [tokenizer.pad_token_id] * (max_length - len(i)) for i in examples["input_ids"]] examples["attention_mask"] = [[1] * len(i) + [0] * (max_length - len(i)) for i in examples["attention_mask"]] examples["labels"] = [i + [-100] * (max_length - len(i)) for i in examples["labels"]] # truncate to max_length examples["input_ids"] = [i[:max_length] for i in examples["input_ids"]] examples["attention_mask"] = [a[:max_length] for a in examples["attention_mask"]] examples["labels"] = [l[:max_length] for l in examples["labels"]] return examples tokenized_datasets = tokenized_datasets.map( pad_function, batched=True, num_proc=4, load_from_cache_file=True, desc=f"Padding dataset to max length {max_length}", ) if save_path is not None: tokenized_datasets.save_to_disk(save_path) return 
tokenized_datasets class QaDataset(Dataset): def __init__(self, tokenizer_name_or_path, select_num=None, start_idx=None, need_preprocess=False, dataset_name=None, data_part="train", seq_max_len=1000 ): self.select_num = select_num self.start_idx = start_idx self.ds = None self.need_preprocess = need_preprocess self.dataset_name = dataset_name self.data_part = data_part self.seq_max_len = seq_max_len self.return_with_idx = False if 'llama' in tokenizer_name_or_path.lower(): self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, unk_token="", bos_token="", eos_token="", add_eos_token=True) self.tokenizer.pad_token = self.tokenizer.eos_token else: self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) if 'gpt2' in tokenizer_name_or_path.lower(): self.tokenizer.pad_token = self.tokenizer.eos_token def load(self, path): local_data = load_from_disk(path) if not self.need_preprocess: self.ds = local_data[self.data_part] else: tokenized_ds = tokenize_qa_dataset( dataset_name=self.dataset_name, tokenizer=self.tokenizer, seq_max_len=self.seq_max_len, data_part=self.data_part, dataset=local_data ) self.ds = tokenized_ds[self.data_part] if self.select_num is not None: if self.start_idx is not None: self.ds = self.ds.select(range(self.start_idx, min(len(self.ds), self.start_idx + self.select_num))) else: self.ds = self.ds.select(range(self.select_num)) def set_return_with_idx(self): self.return_with_idx = True def reset_return_with_idx(self): self.return_with_idx = False def __len__(self): return len(self.ds) def __getitem__(self, idx): if self.return_with_idx: return { "idx": idx, "inputs": self.ds[idx] } else: return self.ds[idx] ================================================ FILE: python/fate_llm/dataset/seq_cls_dataset.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
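# --------------------------------------------------------------------------
# Minimal usage sketch for tokenize_qa_dataset / QaDataset above. Illustrative
# only: the "sciq" dataset, the on-disk path and the gpt2 tokenizer are
# assumptions used for the example.
if __name__ == "__main__":
    from datasets import load_dataset
    from fate_llm.dataset.qa_dataset import QaDataset

    # one-off: materialise the raw dataset in the save_to_disk layout QaDataset.load expects
    load_dataset("sciq").save_to_disk("/data/sciq")

    ds = QaDataset(tokenizer_name_or_path="gpt2",
                   dataset_name="sciq",
                   need_preprocess=True,   # tokenize + pad on load
                   select_num=100)
    ds.load("/data/sciq")
    print(len(ds), list(ds[0].keys()))  # input_ids / attention_mask / labels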
# from fate.ml.nn.dataset.base import Dataset import pandas as pd import torch as t from transformers import AutoTokenizer import os import numpy as np # avoid tokenizer parallelism os.environ["TOKENIZERS_PARALLELISM"] = "false" class SeqCLSDataset(Dataset): """ A Dataset for some basic NLP Tasks, this dataset will automatically transform raw text into word indices using AutoTokenizer from transformers library, Parameters ---------- truncation bool, truncate word sequence to 'text_max_length' text_max_length int, max length of word sequences tokenizer_name_or_path str, name of bert tokenizer(see transformers official for details) or path to local transformer tokenizer folder return_label bool, return label or not, this option is for host dataset, when running hetero-NN padding bool, whether to pad the word sequence to 'text_max_length' padding_side str, 'left' or 'right', where to pad the word sequence pad_token str, pad token, use this str as pad token, if None, use tokenizer.pad_token return_input_ids bool, whether to return input_ids or not, if False, return word_idx['input_ids'] """ def __init__( self, truncation=True, text_max_length=128, tokenizer_name_or_path="bert-base-uncased", return_label=True, padding=True, padding_side="right", pad_token=None, return_input_ids=True): super(SeqCLSDataset, self).__init__() self.text = None self.word_idx = None self.label = None self.tokenizer = None self.sample_ids = None self.padding = padding self.truncation = truncation self.max_length = text_max_length self.with_label = return_label self.tokenizer_name_or_path = tokenizer_name_or_path self.tokenizer = AutoTokenizer.from_pretrained( self.tokenizer_name_or_path) self.tokenizer.padding_side = padding_side self.return_input_ids = return_input_ids if pad_token is not None: self.tokenizer.add_special_tokens({'pad_token': pad_token}) def load(self, file_path): tokenizer = self.tokenizer self.text = pd.read_csv(file_path) text_list = list(self.text.text) self.word_idx = tokenizer( text_list, padding=self.padding, return_tensors='pt', truncation=self.truncation, max_length=self.max_length) if self.return_input_ids: self.word_idx = self.word_idx['input_ids'] if self.with_label: self.label = t.Tensor(self.text.label).detach().numpy() self.label = self.label.reshape((len(self.text), -1)) if 'id' in self.text: self.sample_ids = self.text['id'].values.tolist() def get_classes(self): return np.unique(self.label).tolist() def get_vocab_size(self): return self.tokenizer.vocab_size def get_sample_ids(self): return self.sample_ids def __getitem__(self, item): if self.return_input_ids: ret = self.word_idx[item] else: ret = {k: v[item] for k, v in self.word_idx.items()} if self.with_label: return ret, self.label[item] return ret def __len__(self): return len(self.text) def __repr__(self): return self.tokenizer.__repr__() ================================================ FILE: python/fate_llm/evaluate/__init__.py ================================================ ================================================ FILE: python/fate_llm/evaluate/scripts/__init__.py ================================================ ================================================ FILE: python/fate_llm/evaluate/scripts/_options.py ================================================ import time import click from ..utils.config import parse_config, default_eval_config from ..utils.config import _set_namespace def parse_custom_type(value): parts = value.split('=') if len(parts) == 2 and parts[1].isdigit(): return parts[0], int(parts[1]) elif 
len(parts) == 2 and isinstance(parts[1], str): return parts[0], parts[1] else: raise click.BadParameter('Invalid input format. Use "str=int" or "str=str".') class LlmSharedOptions(object): _options = { "eval_config": (('-c', '--eval_config'), dict(type=click.Path(exists=True), help=f"Manual specify config file", default=None), default_eval_config().__str__()), "yes": (('-y', '--yes',), dict(type=bool, is_flag=True, help="Skip double check", default=None), False), "namespace": (('-n', '--namespace'), dict(type=str, help=f"Manual specify fate llm namespace", default=None), time.strftime('%Y%m%d%H%M%S')) } def __init__(self): self._options_kwargs = {} def __getitem__(self, item): return self._options_kwargs[item] def get(self, k, default=None): v = self._options_kwargs.get(k, default) if v is None and k in self._options: v = self._options[k][2] return v def update(self, **kwargs): for k, v in kwargs.items(): if v is not None: self._options_kwargs[k] = v def post_process(self): # add defaults here for k, v in self._options.items(): if self._options_kwargs.get(k, None) is None: self._options_kwargs[k] = v[2] # update config config = parse_config(self._options_kwargs['eval_config']) self._options_kwargs['eval_config'] = config _set_namespace(self._options_kwargs['namespace']) @classmethod def get_shared_options(cls, hidden=False): def shared_options(f): for name, option in cls._options.items(): f = click.option(*option[0], **dict(option[1], hidden=hidden))(f) return f return shared_options ================================================ FILE: python/fate_llm/evaluate/scripts/config_cli.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import click import yaml from pathlib import Path from ..utils.config import create_eval_config, default_eval_config from ._options import LlmSharedOptions from ..utils._io import echo @click.group("eval_config", help="fate_llm evaluate config") def eval_config_group(): """ eval_config fate_llm """ pass @eval_config_group.command(name="new") def _new(): """ create new fate_llm eval config from template """ create_eval_config(Path("llm_eval_config.yaml")) click.echo(f"create eval_config file: llm_eval_config.yaml") @eval_config_group.command(name="edit") @LlmSharedOptions.get_shared_options(hidden=True) @click.pass_context def _edit(ctx, **kwargs): """ edit fate_llm eval_config file """ ctx.obj.update(**kwargs) eval_config = ctx.obj.get("eval_config") print(f"eval_config: {eval_config}") click.edit(filename=eval_config) @eval_config_group.command(name="show") def _show(): """ show fate_test default eval_config path """ click.echo(f"default eval_config path is {default_eval_config()}") ================================================ FILE: python/fate_llm/evaluate/scripts/data_cli.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. 
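# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): exercising the
# `eval_config` command group defined in config_cli.py above via click's
# test runner. The working directory and file names are placeholders.
# --------------------------------------------------------------------------
def _example_eval_config_cli():
    from click.testing import CliRunner
    # assumed import path, matching the file shown above
    from fate_llm.evaluate.scripts.config_cli import eval_config_group

    runner = CliRunner()
    with runner.isolated_filesystem():
        # "new" writes llm_eval_config.yaml from the built-in template
        result = runner.invoke(eval_config_group, ["new"])
        print(result.output)
        # "show" prints the default eval_config path
        result = runner.invoke(eval_config_group, ["show"])
        print(result.output)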
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os import copy import click import yaml import warnings from typing import Union from ._options import LlmSharedOptions from ..utils.llm_evaluator import download_task from ..utils._io import echo @click.command('download_data') @click.option('-t', '--tasks', required=False, type=str, multiple=True, default=None, help='tasks whose data will be downloaded') # @click.argument('other_args', nargs=-1) @LlmSharedOptions.get_shared_options(hidden=True) @click.pass_context def download_data(ctx, tasks, **kwargs): """ Evaluate a pretrained model with specified parameters. """ ctx.obj.update(**kwargs) ctx.obj.post_process() if tasks is None or len(tasks) == 0: tasks = None echo.echo(f"No task is given, will download data for all built-in tasks.", fg='red') else: echo.echo(f"given tasks: {tasks}", fg='red') download_task(tasks) ================================================ FILE: python/fate_llm/evaluate/scripts/eval_cli.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os import copy import click import yaml import warnings from typing import Union from ._options import LlmSharedOptions from ..utils.config import default_eval_config from ..utils.llm_evaluator import evaluate, init_tasks, aggregate_table from ..utils.model_tools import load_by_loader from ..utils._io import echo from ..utils._parser import LlmSuite @click.command('evaluate') @click.option('-i', '--include', required=True, type=click.Path(exists=True), help='Path to model and metrics conf') @click.option('-c', '--eval-config', type=click.Path(exists=True), help='Path to FATE Llm evaluation config. ' 'If not provided, use default config.') @click.option('-o', '--result-output', type=click.Path(), help='Path to save evaluation results.') # @click.argument('other_args', nargs=-1) @LlmSharedOptions.get_shared_options(hidden=True) @click.pass_context def run_evaluate(ctx, include, eval_config, result_output, **kwargs): """ Evaluate a pretrained model with specified parameters. """ ctx.obj.update(**kwargs) ctx.obj.post_process() # namespace = ctx.obj["namespace"] yes = ctx.obj["yes"] echo.echo(f"include: {include}", fg='red') try: # include = os.path.abspath(include) suite = LlmSuite.load(include) except Exception as e: raise ValueError(f"Invalid include path: {include}, please check. 
{e}")
    if not eval_config:
        eval_config = default_eval_config()
    if not os.path.exists(eval_config):
        eval_config = None
    if not yes and not click.confirm("running?"):
        return
    # init tasks
    init_tasks()
    run_suite_eval(suite, eval_config, result_output)


def run_job_eval(job, eval_conf):
    job_eval_conf = {}
    if isinstance(eval_conf, dict):
        job_eval_conf.update(eval_conf)
    elif eval_conf is not None and os.path.exists(eval_conf):
        with open(eval_conf, 'r') as f:
            job_eval_conf.update(yaml.safe_load(f))
    if job.eval_conf_path:
        # job-level eval conf takes priority
        with open(job.eval_conf_path, 'r') as f:
            job_eval_conf.update(yaml.safe_load(f))
    if job.loader:
        # load model through a user-provided loader
        if job.peft_path:
            model = load_by_loader(loader_name=job.loader,
                                   loader_conf_path=job.loader_conf_path,
                                   peft_path=job.peft_path)
        else:
            model = load_by_loader(loader_name=job.loader,
                                   loader_conf_path=job.loader_conf_path)
        result = evaluate(model=model, tasks=job.tasks, include_path=job.include_path, **job_eval_conf)
    else:
        # feed in pretrained & peft path
        job_eval_conf.setdefault("model_args", {})
        job_eval_conf["model_args"]["pretrained"] = job.pretrained_model_path
        if job.peft_path:
            job_eval_conf["model_args"]["peft"] = job.peft_path
        result = evaluate(tasks=job.tasks, include_path=job.include_path, **job_eval_conf)
    return result


def run_suite_eval(suite, eval_conf, output_path=None):
    suite_results = dict()
    for pair in suite.pairs:
        job_results = dict()
        for job in pair.jobs:
            if not job.evaluate_only:
                # only evaluate-only jobs (those providing a pretrained model) are run here
                warnings.warn(f"Job {job.job_name} will be skipped since it is not evaluate-only")
                continue
            echo.echo(f"Evaluating job: {job.job_name} with tasks: {job.tasks}")
            result = run_job_eval(job, eval_conf)
            job_results[job.job_name] = result
        suite_results[pair.pair_name] = job_results
    suite_writers = aggregate_table(suite_results)
    for pair_name, pair_writer in suite_writers.items():
        echo.sep_line()
        echo.echo(f"Pair: {pair_name}")
        echo.sep_line()
        echo.echo(pair_writer.dumps())
        echo.stdout_newline()
    if output_path:
        with open(output_path, 'w') as f:
            for pair_name, pair_writer in suite_writers.items():
                f.write(pair_writer.dumps())


================================================
FILE: python/fate_llm/evaluate/scripts/fate_llm_cli.py
================================================
#
#  Copyright 2024 The FATE Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
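# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the programmatic
# equivalent of the `evaluate` command defined in eval_cli.py above.
# File names are placeholders.
# --------------------------------------------------------------------------
def _example_run_suite_eval():
    # assumed import paths, matching the files shown above
    from fate_llm.evaluate.scripts.eval_cli import run_suite_eval
    from fate_llm.evaluate.utils.llm_evaluator import init_tasks
    from fate_llm.evaluate.utils._parser import LlmSuite

    init_tasks()                                   # point built-in task configs at local data
    suite = LlmSuite.load("my_llmsuite.yaml")      # placeholder suite file
    # eval_conf may be a dict or a path to a yaml file; a job-level eval_conf overrides it
    run_suite_eval(suite, eval_conf={"batch_size": 10}, output_path="eval_results.md")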
# import click import yaml from typing import Union from .eval_cli import run_evaluate from .config_cli import eval_config_group from .data_cli import download_data from ._options import LlmSharedOptions commands = { "evaluate": run_evaluate, "config": eval_config_group, "download": download_data } class FATELlmCLI(click.MultiCommand): def list_commands(self, ctx): return list(commands) def get_command(self, ctx, name): if name not in commands and name in commands_alias: name = commands_alias[name] if name not in commands: ctx.fail("No such command '{}'.".format(name)) return commands[name] @click.command(cls=FATELlmCLI, help="A collection of tools to run FATE Llm Evaluation.", context_settings=dict(help_option_names=["-h", "--help"])) @LlmSharedOptions.get_shared_options() @click.pass_context def fate_llm_cli(ctx, **kwargs): ctx.ensure_object(LlmSharedOptions) ctx.obj.update(**kwargs) if __name__ == '__main__': fate_llm_cli(obj=LlmSharedOptions()) ================================================ FILE: python/fate_llm/evaluate/tasks/__init__.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import yaml import os def local_fn_constructor(loader, node): return node def local_fn_representer(dumper, data): return data def dump_yaml(dict, path): yaml.add_representer(yaml.ScalarNode, local_fn_representer) with open(path, 'w') as f: yaml.dump(dict, f) class Task: _task_name = "" _task_dir = "" _task_conf_file = "" _task_source_url = "" script_dir = os.path.dirname(__file__) @property def task_name(self): return self._task_name @property def task_template(self): yaml.add_constructor("!function", local_fn_constructor) with open(os.path.abspath(os.path.join(self.script_dir, self._task_dir, self._task_conf_file)), "rb") as f: task_template = yaml.full_load(f) return task_template @property def task_scr_dir(self): return os.path.abspath(os.path.join(self.script_dir, self._task_dir)) @property def task_conf_path(self): return os.path.abspath(os.path.join(self.script_dir, self._task_dir, self._task_conf_file)) @property def task_source_url(self): return self._task_source_url def download_from_source(self): raise NotImplementedError(f"Should not be called here.") class Dolly(Task): _task_name = "dolly-15k" _task_dir = "dolly_15k" _task_conf_file = "default_dolly_15k.yaml" def download_from_source(self): try: from datasets import load_dataset data = load_dataset("databricks/databricks-dolly-15k", split="train") filename = os.path.join(self.task_scr_dir, "databricks-dolly-15k.jsonl") data.to_json(filename) return True except Exception as e: print(f"Failed to download data from source: {e}") return False class AdvertiseGen(Task): _task_name = "advertise-gen" _task_dir = "advertise_gen" _task_conf_file = "default_advertise_gen.yaml" _task_source_url = ["https://cloud.tsinghua.edu.cn/seafhttp/files/3781289a-5a60-44b1-b5f1-a04364e3eb9d/AdvertiseGen.tar.gz", 
"https://docs.google.com/uc?export=download&id=13_vf0xRTQsyneRKdD1bZIr93vBGOczrk"] def download_from_source(self): from ..utils.data_tools import download_data result = download_data(self.task_scr_dir, self.task_source_url[0]) if not result: print(f"retry with address: {self.task_source_url[1]}") return download_data(self.task_scr_dir, self.task_source_url[1]) return result build_in_tasks = {"dolly-15k": Dolly(), "advertise-gen": AdvertiseGen()} ================================================ FILE: python/fate_llm/evaluate/tasks/advertise_gen/__init__.py ================================================ ================================================ FILE: python/fate_llm/evaluate/tasks/advertise_gen/advertise_utils.py ================================================ # adopted from https://github.com/huggingface/datasets/blob/main/metrics/rouge/rouge.py from rouge_score import rouge_scorer # from multiprocessing import Pool def rouge_l(predictions, references, use_stemmer=False): scorer = rouge_scorer.RougeScorer(rouge_types=['rougeL'], use_stemmer=use_stemmer) scores = [] for ref, pred in zip(references, predictions): score = scorer.score(ref, pred) scores.append(score) rouge_l_score = scores[0]['rougeL'].fmeasure return rouge_l_score ================================================ FILE: python/fate_llm/evaluate/tasks/advertise_gen/default_advertise_gen.yaml ================================================ dataset_kwargs: data_files: train: train.json validation: dev.json dataset_path: json doc_to_target: '{{summary}}' doc_to_text: '{{content}}' metric_list: - aggregation: mean higher_is_better: true metric: !function 'advertise_utils.rouge_l' output_type: generate_until task: advertise-gen validation_split: validation ================================================ FILE: python/fate_llm/evaluate/tasks/dolly_15k/__init__.py ================================================ ================================================ FILE: python/fate_llm/evaluate/tasks/dolly_15k/default_dolly_15k.yaml ================================================ dataset_kwargs: data_files: databricks-dolly-15k.jsonl dataset_path: json doc_to_target: '{{response}}' doc_to_text: !function 'dolly_utils.doc_to_text' metric_list: - aggregation: mean higher_is_better: true metric: !function 'dolly_utils.rouge_l' output_type: generate_until task: dolly-15k validation_split: train ================================================ FILE: python/fate_llm/evaluate/tasks/dolly_15k/dolly_utils.py ================================================ # adopted from https://github.com/huggingface/datasets/blob/main/metrics/rouge/rouge.py from rouge_score import rouge_scorer def rouge_l(predictions, references, use_stemmer=False): scorer = rouge_scorer.RougeScorer(rouge_types=['rougeL'], use_stemmer=use_stemmer) scores = [] for ref, pred in zip(references, predictions): score = scorer.score(ref, pred) scores.append(score) rouge_l_score = scores[0]['rougeL'].fmeasure return rouge_l_score def doc_to_text(doc): if doc["context"]: return f"context: {doc['context']}\ninstruction: {doc['instruction']}\nresponse:" else: return f"instruction: {doc['instruction']}\nresponse:" """ def train_load_evalaute_lm(): pipeline.fit(train_data) lm = OTModelLoader().load(path, **args) from fate_llm.evaluator import evaluator # general case evaluator.evaluate(lm, task="dolly_15k", **args) # user modified conf config = evaluator.get_task_template(task="dolly_15k") # return dict copy of yaml file config['dataset_kwargs'] = {"dataset_kwargs": {"data_files": 
{"test": './dolly_15k_test.csv', "dev": './dolly_15k_dev.csv'}}} # may provide arbitrary export path, must be of dir, create temp dir under the given path: {$export_path}/temp_dir new_task_dir = evaluator.export_config(config, task="dolly_15k", export_path=None) result = evaluator.evalute(lm, task="dolly_15k", include_path=new_task_dir, **args) print(result) # dict evaluator.delete_config(new_task_dir) """ ================================================ FILE: python/fate_llm/evaluate/utils/__init__.py ================================================ from ._parser import LlmJob, LlmPair, LlmSuite ================================================ FILE: python/fate_llm/evaluate/utils/_io.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import click import loguru # noinspection PyPep8Naming class echo(object): _file = None @classmethod def set_file(cls, file): cls._file = file @classmethod def echo(cls, message, **kwargs): click.secho(message, **kwargs) click.secho(message, file=cls._file, **kwargs) @classmethod def sep_line(cls): click.secho("-------------------------------------------------") @classmethod def file(cls, message, **kwargs): click.secho(message, file=cls._file, **kwargs) @classmethod def stdout(cls, message, **kwargs): click.secho(message, **kwargs) @classmethod def stdout_newline(cls): click.secho("") @classmethod def welcome(cls): cls.echo("Welcome to FATE Llm Evaluator") @classmethod def flush(cls): import sys sys.stdout.flush() def set_logger(name): loguru.logger.remove() loguru.logger.add(name, level='ERROR', delay=True) return loguru.logger LOGGER = loguru.logger ================================================ FILE: python/fate_llm/evaluate/utils/_parser.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
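# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): what the dolly-15k
# helpers above produce for a single record. Requires the rouge_score package;
# the sample doc is made up.
# --------------------------------------------------------------------------
def _example_dolly_helpers():
    # assumed import path, matching tasks/dolly_15k/dolly_utils.py above
    from fate_llm.evaluate.tasks.dolly_15k.dolly_utils import doc_to_text, rouge_l

    doc = {"context": "", "instruction": "Name a primary color.", "response": "Red."}
    prompt = doc_to_text(doc)
    # -> "instruction: Name a primary color.\nresponse:" (empty context branch)

    # the task config aggregates with mean, so the metric is typically applied to
    # single prediction/reference pairs
    score = rouge_l(predictions=["Red."], references=["Red."])
    return prompt, score   # score is the rougeL f-measure of the first pair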
# import os import yaml import typing from pathlib import Path class LlmJob(object): def __init__(self, job_name: str, script_path: Path=None, conf_path: Path=None, model_task_name: str=None, pretrained_model_path: Path=None, peft_path: Path=None, eval_conf_path: Path=None, loader: str=None, loader_conf_path: Path=None, tasks: typing.List[str]=None, include_path: Path=None, peft_path_format: str=None): self.job_name = job_name self.script_path = script_path self.conf_path = conf_path self.model_task_name = model_task_name self.pretrained_model_path = pretrained_model_path self.peft_path = peft_path self.loader = loader self.loader_conf_path = loader_conf_path self.eval_conf_path = eval_conf_path self.tasks = tasks self.include_path = include_path self.evaluate_only = self.script_path is None self.peft_path_format = peft_path_format class LlmPair(object): def __init__( self, pair_name: str, jobs: typing.List[LlmJob] ): self.pair_name = pair_name self.jobs = jobs class LlmSuite(object): def __init__( self, pairs: typing.List[LlmPair], path: Path, dataset=None ): self.pairs = pairs self.path = path self.dataset = dataset self._final_status = {} @staticmethod def load(path: Path): if isinstance(path, str): path = Path(path) with path.open("r") as f: testsuite_config = yaml.safe_load(f) pairs = [] for pair_name, pair_configs in testsuite_config.items(): if pair_name == "data": continue jobs = [] for job_name, job_configs in pair_configs.items(): # with train script_path = job_configs.get("script", None) if script_path and not os.path.isabs(script_path): script_path = path.parent.joinpath(script_path).resolve() conf_path = job_configs.get("conf", None) if conf_path and not os.path.isabs(conf_path): conf_path = path.parent.joinpath(conf_path).resolve() model_task_name = job_configs.get("model_task_name", None) # evaluate only pretrained_model_path = job_configs.get("pretrained", None) if pretrained_model_path and not os.path.isabs(pretrained_model_path): # make path absolute, else keep original pretrained model name if "yaml" in pretrained_model_path or "/" in pretrained_model_path: pretrained_model_path = path.parent.joinpath(pretrained_model_path).resolve() peft_path = job_configs.get("peft", None) if peft_path and not os.path.isabs(peft_path): peft_path = path.parent.joinpath(peft_path).resolve() eval_conf_path = job_configs.get("eval_conf", None) if eval_conf_path and not os.path.isabs(eval_conf_path): eval_conf_path = path.parent.joinpath(eval_conf_path).resolve() loader = job_configs.get("loader", None) if job_configs.get("loader_conf"): loader_conf_path = path.parent.joinpath(job_configs["loader_conf"]).resolve() else: loader_conf_path = "" tasks = job_configs.get("tasks", []) include_path = job_configs.get("include_path", "") if include_path and not os.path.isabs(include_path): include_path = path.parent.joinpath(job_configs["include_path"]).resolve() peft_path_format = job_configs.get("peft_path_format", "{{fate_base}}/fate_flow/model/{{job_id}}/" "guest/{{party_id}}/{{model_task_name}}/0/" "output/output_model/model_directory") jobs.append( LlmJob( job_name=job_name, script_path=script_path, conf_path=conf_path, model_task_name=model_task_name, pretrained_model_path=pretrained_model_path, peft_path=peft_path, eval_conf_path=eval_conf_path, loader=loader, loader_conf_path=loader_conf_path, tasks=tasks, include_path=include_path, peft_path_format=peft_path_format ) ) pairs.append( LlmPair( pair_name=pair_name, jobs=jobs ) ) suite = LlmSuite(pairs=pairs, path=path) return suite def 
update_status( self, pair_name, job_name, job_id=None, status=None, exception_id=None, time_elapsed=None, event=None ): for k, v in locals().items(): if k != "job_name" and k != "pair_name" and v is not None: if self._final_status.get(f"{pair_name}-{job_name}"): setattr(self._final_status[f"{pair_name}-{job_name}"], k, v) def get_final_status(self): return self._final_status ================================================ FILE: python/fate_llm/evaluate/utils/config.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os import click import yaml import typing from pathlib import Path from ._io import set_logger, echo DEFAULT_FATE_LLM_BASE_PATH = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) FATE_LLM_BASE_PATH = os.getenv("FATE_LLM_BASE_PATH") or DEFAULT_FATE_LLM_BASE_PATH # DEFAULT_TASK_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../tasks")) DEFAULT_FATE_LLM_TASK_PATH = os.path.abspath(os.path.join(FATE_LLM_BASE_PATH, "tasks")) FATE_LLM_TASK_PATH = os.getenv("FATE_LLM_TASK_PATH") or DEFAULT_FATE_LLM_TASK_PATH _default_eval_config = Path(FATE_LLM_BASE_PATH).resolve() / 'llm_eval_config.yaml' template = """# args for evaluate batch_size: 10 model_args: device: cuda dtype: auto trust_remote_code: true num_fewshot: 0 """ def create_eval_config(path: Path, override=False): if path.exists() and not override: raise FileExistsError(f"{path} exists") with path.open("w") as f: f.write(template) def default_eval_config(): if not _default_eval_config.exists(): create_eval_config(_default_eval_config) return _default_eval_config class Config(object): def __init__(self, config): self.update_conf(**config) def update_conf(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v) @staticmethod def load(path: typing.Union[str, Path], **kwargs): if isinstance(path, str): path = Path(path) config = {} if path is not None: with path.open("r") as f: config.update(yaml.safe_load(f)) config.update(kwargs) return Config(config) @staticmethod def load_from_file(path: typing.Union[str, Path]): """ Loads conf content from yaml file. 
Used to read in parameter configuration Parameters ---------- path: str, path to conf file, should be absolute path Returns ------- dict, parameter configuration in dictionary format """ if isinstance(path, str): path = Path(path) config = {} if path is not None: file_type = path.suffix with path.open("r") as f: if file_type == ".yaml": config.update(yaml.safe_load(f)) else: raise ValueError(f"Cannot load conf from file type {file_type}") return config def parse_config(config): try: config_inst = Config.load(config) except Exception as e: raise RuntimeError(f"error parse config from {config}") from e return config_inst def _set_namespace(namespace): Path(f"logs/{namespace}").mkdir(exist_ok=True, parents=True) set_logger(f"logs/{namespace}/exception.log") echo.set_file(click.open_file(f'logs/{namespace}/stdout', "a")) ================================================ FILE: python/fate_llm/evaluate/utils/data_tools.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # def download_data(data_dir, data_url, is_tar=True): import os import requests import tarfile import io # Create data directory if not os.path.exists(data_dir): os.makedirs(data_dir) # Download data try: response = requests.get(data_url) if response.status_code == 200: if is_tar: # extract tar file and write to data_dir with tarfile.open(fileobj=io.BytesIO(response.content), mode='r:gz') as tar: for member in tar.getmembers(): # check if member is a file if member.isreg(): member.name = os.path.join(data_dir, os.path.basename(member.name)) tar.extract(member) else: # write to data_dir with open(os.path.join(data_dir, os.path.basename(data_url)), 'wb') as f: f.write(response.content) return True else: print(f"Error downloading file: {response.status_code}") return False except Exception as e: print(f"Error downloading file: {e}") return False ================================================ FILE: python/fate_llm/evaluate/utils/llm_evaluator.py ================================================ # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
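# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): creating and reading
# the evaluation config handled by utils/config.py above. The file location
# is a placeholder.
# --------------------------------------------------------------------------
def _example_eval_config():
    from pathlib import Path
    # assumed import path, matching utils/config.py above
    from fate_llm.evaluate.utils.config import create_eval_config, parse_config

    path = Path("llm_eval_config.yaml")        # placeholder location
    if not path.exists():
        create_eval_config(path)               # writes the built-in template
    conf = parse_config(str(path))             # Config object with attributes from the yaml
    # the template defines, e.g., batch_size, model_args (device/dtype/...), num_fewshot
    return conf.batch_size, conf.model_args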
# # this file is used to evaluate the model on fate-llm built-in tasks and user-given tasks import os import tempfile import yaml import shutil import warnings from pytablewriter import MarkdownTableWriter import lm_eval from lm_eval.utils import load_yaml_config from ..tasks import build_in_tasks, dump_yaml from .config import FATE_LLM_BASE_PATH, FATE_LLM_TASK_PATH def evaluate(tasks, model="hf", model_args=None, include_path=None, task_manager=None, show_result=False, **kwargs): """ Evaluate the model on given tasks. Simplified uses for built-in tasks. Parameters ---------- tasks: str or List[str], task name(s) model: str or model object, model to be evaluated, select from lm_eval supported types: {"hf-auto", "hf", "huggingface", "vllm"} model_args: model args, str or dict include_path: task path for tasks not in built-in tasks task_manager: lm_eval.TakManger object kwargs Returns ------- """ if task_manager: if not isinstance(task_manager, lm_eval.tasks.TaskManager): raise ValueError(f"'task_manager' must be of TaskManager type.") elif include_path: task_manager = lm_eval.tasks.TaskManager(include_path=str(include_path)) else: task_manager = lm_eval.tasks.TaskManager(include_path=str(FATE_LLM_TASK_PATH)) task_names = [] if isinstance(tasks, str): task_names.append(tasks) elif isinstance(tasks, list): for task in tasks: if isinstance(task, str): task_names.append(task) else: raise ValueError(f"tasks: {task} of type {type(task)} not valid, please check.") else: raise ValueError(f"tasks: {tasks} of type {type(tasks)} not valid, please check.") results = lm_eval.simple_evaluate( model=model, model_args=model_args, tasks=task_names, task_manager=task_manager, **kwargs ) if show_result: result_table = lm_eval.utils.make_table(results) print(result_table) return results def aggregate_table(results): """ adapted from lm_eval.utils.make_table: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.2/lm_eval/utils.py Aggregate results from different models with same tasks Parameters ---------- results: dict, results from different models Returns ------- """ suite_writers = dict() for pair_name, pair_results in results.items(): # job_count = len(pair_results) all_jobs = list(pair_results.keys()) md_writer = MarkdownTableWriter() values = [] task_results = dict() # print(f"pair results: {pair_results}") for job_name, result_dict in pair_results.items(): if "results" in result_dict and result_dict["results"]: column = "results" else: column = "groups" for k, dic in result_dict[column].items(): if "alias" in dic: # task alias k = dic.pop("alias") for (mf), v in dic.items(): m, _, f = mf.partition(",") if m.endswith("_stderr"): continue if m + "_stderr" + "," + f in dic: se = dic[m + "_stderr" + "," + f] if se != "N/A": se = "%.4f" % se v = "%.4f ± %s" % (v, se) else: v = "%.4f" % v task_results.setdefault(k, {}).setdefault(job_name, {})[m] = v # job names as columns # print(f"task results: {task_results}") for task_name, task_result in task_results.items(): metrics = {inner_key for inner_dict in task_result.values() for inner_key, value in inner_dict.items()} for metric in metrics: row = [f"{task_name}({metric})"] for job_name in all_jobs: if job_name in task_result: row.append(task_result[job_name].get(metric, "N/A")) else: row.append("N/A") values.append(row) all_headers = ["Task"] + list(pair_results.keys()) md_writer.headers = all_headers md_writer.value_matrix = values suite_writers[pair_name] = md_writer return suite_writers def get_task_template(task): if not isinstance(task, str) or 
task not in build_in_tasks: raise ValueError(f"{task} not found in build in task, please check input.") result = build_in_tasks.get(task).task_template return result def export_config(config, task, export_dir=None, export_sub_dir=None): scr_dir = build_in_tasks.get(task).task_scr_dir if export_dir is None: export_dir = os.path.dirname(scr_dir) if export_sub_dir is None: temp_dir = tempfile.mkdtemp() # make sure the relative path in new file will work full_export_dir = os.path.join(export_dir, os.path.basename(temp_dir)) os.rename(temp_dir, full_export_dir) else: full_export_dir = os.path.join(export_dir, export_sub_dir) copy_directory_to_dst(scr_dir, full_export_dir, build_in_tasks.get(task).task_conf_path, config) return full_export_dir def copy_directory_to_dst(src_dir, dst_dir, target_conf_file, new_conf: dict): """parent_dir = os.path.dirname(src_dir) temp_dir = tempfile.mkdtemp() # make sure the relative path in new file will work temp_dir_in_parent = os.path.join(parent_dir, os.path.basename(temp_dir)) os.rename(temp_dir, temp_dir_in_parent)""" for item in os.listdir(src_dir): src_item = os.path.join(src_dir, item) dst_item = os.path.join(dst_dir, item) if os.path.isdir(src_item): shutil.copytree(src_item, dst_item) else: if item == target_conf_file: # write new conf file dump_yaml(new_conf, dst_item) else: shutil.copy2(src_item, dst_item) # shutil.copy2(src_item, dst_item) def contains_subdirectory(path, subdirectories): base_name = os.path.basename(path) if base_name in subdirectories: return True for root, dirs, files in os.walk(path): for d in dirs: if d in subdirectories: return True return False def delete_config(target_dir, force=False): if not force: # check if target dir in any of the build in tasks, only rm dir for build in tasks if force=True all_build_in_dir = {task.task_scr_dir for task in build_in_tasks.values()} if contains_subdirectory(target_dir, all_build_in_dir): warnings.warn(f"Built-in task(s) found in given target directory, please check input or set `force`=True.") return shutil.rmtree(target_dir) def set_environ_fate_llm_base(path): if path: os.environ["FATE_LLM_BASE_PATH"] = path def set_environ_fate_llm_task_base(path): if path: os.environ["FATE_LLM_TASK_PATH"] = path def init_tasks(root_path=None): """ Parameters ---------- root_path: str, default None, root path for all local datasets in built-in tasks, {$root_path}/{$data_files}; if not provided, current file path will be used to generate root Returns ------- """ for task in build_in_tasks.values(): conf_path = task.task_conf_path parent_path = os.path.dirname(conf_path) task_template = task.task_template data_args = task_template.get("dataset_kwargs") if data_args: data_files = data_args.get("data_files") if isinstance(data_files, str): if data_files.endswith("jsonl") or data_files.endswith("json"): if root_path: parent_dir = os.path.basename(parent_path) new_conf_path = os.path.join(root_path, parent_dir, os.path.basename(conf_path)) else: new_conf_path = os.path.join(parent_path, data_files) task_template["dataset_kwargs"]["data_files"] = new_conf_path elif isinstance(data_files, dict): for k, v in data_files.items(): if root_path: parent_dir = os.path.basename(parent_path) new_conf_path = os.path.join(root_path, parent_dir, os.path.basename(conf_path)) else: new_conf_path = os.path.join(parent_path, v) task_template["dataset_kwargs"]["data_files"][k] = new_conf_path try: dump_yaml(task_template, conf_path) except FileNotFoundError: raise ValueError(f"Cannot find task config {conf_path}, please 
check.")
        except Exception:
            raise ValueError(f"Initialization failed.")


def download_task(tasks=None):
    if tasks is None:
        tasks = list(build_in_tasks.keys())
    if isinstance(tasks, str):
        tasks = [tasks]
    n = len(tasks)
    i = 1
    for task in tasks:
        task_obj = build_in_tasks.get(task)
        if task_obj is None:
            print(f"Task {task} not found in built-in tasks, please check.")
            continue
        result = task_obj.download_from_source()
        if result:
            print(f"Finished downloading task data {i}/{n}: {task}, saved to {task_obj.task_scr_dir}.\n")
        else:
            print(f"Failed to download task data {i}/{n} to {task_obj.task_scr_dir}.\n")
        i += 1


================================================
FILE: python/fate_llm/evaluate/utils/model_tools.py
================================================
#
#  Copyright 2024 The FATE Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import os
from transformers import AutoModel, AutoTokenizer
from lm_eval.models.huggingface import HFLM


def load_model_from_path(model_path, peft_path=None, peft_config=None, model_args=None):
    model_args = model_args or {}
    if peft_path is None:
        # pretrained model paths are usually directories, so check existence rather than isfile
        if os.path.exists(model_path):
            return HFLM(pretrained=model_path, **model_args)
        else:
            raise ValueError(f"given model path is not valid, please check: {model_path}")
    else:
        import torch
        from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
        model.half()
        model.eval()
        peft_config = peft_config or {}
        peft_config = LoraConfig(**peft_config)
        model = get_peft_model(model, peft_config)
        model.load_state_dict(torch.load(peft_path), strict=False)
        model.model.half()
        # return the wrapped lm_eval model (originally built but never returned)
        return HFLM(pretrained=model, tokenizer=tokenizer, **model_args)


def load_model(model_path, peft_path=None, model_args=None):
    model_args = model_args or {}
    return HFLM(pretrained=model_path, peft_path=peft_path, **model_args)


def load_by_loader(loader_name=None, loader_conf_path=None, peft_path=None):
    # @todo: find loader fn & return loaded model
    pass


================================================
FILE: python/fate_llm/inference/__init__.py
================================================


================================================
FILE: python/fate_llm/inference/api.py
================================================
#
#  Copyright 2019 The FATE Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
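# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): wrapping a fine-tuned
# model with model_tools.load_model above and scoring it on a built-in task.
# Model and adapter paths are placeholders.
# --------------------------------------------------------------------------
def _example_load_and_evaluate():
    # assumed import paths, matching the files shown above
    from fate_llm.evaluate.utils.model_tools import load_model
    from fate_llm.evaluate.utils.llm_evaluator import init_tasks, evaluate

    init_tasks()
    lm = load_model("/path/to/pretrained_model",           # placeholder base model
                    peft_path="/path/to/lora_adapter",      # placeholder PEFT weights
                    model_args={"device": "cuda", "dtype": "auto"})
    results = evaluate(tasks="dolly-15k", model=lm, show_result=True)
    return results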
# from fate_llm.inference.inference_base import Inference from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import GenerationConfig from typing import List class APICompletionInference(Inference): def __init__(self, api_url: str, model_name: str, api_key: str = 'EMPTY', api_timeout=3600): from openai import OpenAI self.model_name = model_name self.client = OpenAI( api_key=api_key, base_url=api_url, timeout=api_timeout ) def inference(self, docs: List[str], inference_kwargs: dict = {}) -> List[str]: completion = self.client.completions.create(model=self.model_name, prompt=docs, **inference_kwargs) rs_doc = [completion.choices[i].text for i in range(len(completion.choices))] return rs_doc ================================================ FILE: python/fate_llm/inference/hf_qw.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate_llm.inference.inference_base import Inference from transformers import AutoModelForCausalLM, AutoTokenizer from typing import List import tqdm class QwenHFCompletionInference(Inference): def __init__(self, model, tokenizer): self.model = model self.tokenizer = tokenizer def inference(self, docs: List[str], inference_kwargs: dict = {}) -> List[str]: self.model = self.model.eval() rs_list = [] for d in tqdm.tqdm(docs): inputs = self.tokenizer(d, return_tensors='pt') inputs = inputs.to(self.model.device) inputs.update(inference_kwargs) pred = self.model.generate(**inputs) response = self.tokenizer.decode(pred.cpu()[0][len(inputs['input_ids'][0]):], skip_special_tokens=True) rs_list.append(response) self.model = self.model.train() return rs_list ================================================ FILE: python/fate_llm/inference/inference_base.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from typing import List class Inference(object): def __init__(self): pass def inference(self, docs: List[str], inference_kwargs: dict = {}) -> List[str]: raise NotImplementedError() ================================================ FILE: python/fate_llm/inference/vllm.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate_llm.inference.inference_base import Inference from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import GenerationConfig import logging from typing import List logger = logging.getLogger(__name__) class VLLMInference(Inference): def __init__(self, model_path, num_gpu=1, dtype='float16', gpu_memory_utilization=0.9): from vllm import LLM self.llm = LLM(model=model_path, trust_remote_code=True, dtype=dtype, tensor_parallel_size=num_gpu, gpu_memory_utilization=gpu_memory_utilization) logger.info('vllm model init done, model path is {}'.format(model_path)) def inference(self, docs: List[str], inference_kwargs: dict = {}) -> List[str]: from vllm import SamplingParams param = SamplingParams(**inference_kwargs) outputs = self.llm.generate( prompts=docs, sampling_params=param) rs = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text rs.append(generated_text) return rs ================================================ FILE: python/fate_llm/model_zoo/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/model_zoo/embedding_transformer/__init__.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ================================================ FILE: python/fate_llm/model_zoo/embedding_transformer/st_model.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
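# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the common Inference
# interface implemented by APICompletionInference / QwenHFCompletionInference /
# VLLMInference above. Endpoint, model name and sampling kwargs are placeholders.
# --------------------------------------------------------------------------
def _example_inference_backend():
    from fate_llm.inference.api import APICompletionInference

    # any OpenAI-compatible completion endpoint (e.g. a locally served model)
    infer = APICompletionInference(api_url="http://127.0.0.1:8000/v1",
                                   model_name="my-served-model")
    docs = ["Translate to French: hello", "Translate to French: goodbye"]
    # inference_kwargs are forwarded to client.completions.create()
    answers = infer.inference(docs, inference_kwargs={"max_tokens": 32, "temperature": 0.0})
    return answers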
# See the License for the specific language governing permissions and # limitations under the License. # from sentence_transformers import SentenceTransformer from typing import Any, Optional, Dict, Union class SentenceTransformerModel(object): def __init__( self, model_name_or_path: Optional[str] = None, device: Optional[str] = None, prompts: Optional[Dict[str, str]] = None, default_prompt_name: Optional[str] = None, cache_folder: Optional[str] = None, trust_remote_code: bool = False, revision: Optional[str] = None, local_files_only: bool = False, token: Optional[Union[bool, str]] = None, use_auth_token: Optional[Union[bool, str]] = None, truncate_dim: Optional[int] = None, model_kwargs: Optional[Dict[str, Any]] = None, tokenizer_kwargs: Optional[Dict[str, Any]] = None, config_kwargs: Optional[Dict[str, Any]] = None, ) -> None: self.model_name_or_path = model_name_or_path self.device = device self.prompts = prompts self.default_prompt_name = default_prompt_name self.cache_folder = cache_folder self.trust_remote_code = trust_remote_code self.revision = revision self.local_files_only = local_files_only self.token = token self.use_auth_token = use_auth_token self.truncate_dim = truncate_dim self.model_kwargs = model_kwargs self.tokenizer_kwargs = tokenizer_kwargs self.config_kwargs = config_kwargs def load(self): model = SentenceTransformer( model_name_or_path=self.model_name_or_path, device=self.device, prompts=self.prompts, default_prompt_name=self.default_prompt_name, cache_folder=self.cache_folder, trust_remote_code=self.trust_remote_code, revision=self.revision, local_files_only=self.local_files_only, token=self.token, use_auth_token=self.use_auth_token, truncate_dim=self.truncate_dim, model_kwargs=self.model_kwargs, tokenizer_kwargs=self.tokenizer_kwargs, config_kwargs=self.config_kwargs ) return model ================================================ FILE: python/fate_llm/model_zoo/hf_model.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import torch from transformers import AutoModelForCausalLM class HFAutoModelForCausalLM: def __init__(self, pretrained_model_name_or_path, *model_args, **kwargs) -> None: self.pretrained_model_name_or_path = pretrained_model_name_or_path self.model_args = model_args self.kwargs = kwargs if "torch_dtype" in self.kwargs and self.kwargs["torch_dtype"] != "auto": dtype = self.kwargs.pop("torch_dtype") self.kwargs["torch_dtype"] = getattr(torch, dtype) def load(self): model = AutoModelForCausalLM.from_pretrained( self.pretrained_model_name_or_path, *self.model_args, **self.kwargs ) return model ================================================ FILE: python/fate_llm/model_zoo/offsite_tuning/__init__.py ================================================ ================================================ FILE: python/fate_llm/model_zoo/offsite_tuning/bloom.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. 
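# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the lazy model wrappers
# from model_zoo above. Model names are placeholders.
# --------------------------------------------------------------------------
def _example_model_zoo_wrappers():
    # assumed import paths, matching the files shown above
    from fate_llm.model_zoo.hf_model import HFAutoModelForCausalLM
    from fate_llm.model_zoo.embedding_transformer.st_model import SentenceTransformerModel

    # string dtypes other than "auto" are converted to torch dtypes before loading
    causal_lm = HFAutoModelForCausalLM("gpt2", torch_dtype="float16").load()

    st = SentenceTransformerModel(model_name_or_path="all-MiniLM-L6-v2").load()
    embeddings = st.encode(["federated learning"])
    return causal_lm, embeddings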
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel, get_dropout_emulator_and_adapters, split_numpy_array, recover_numpy_array from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomModel, BloomConfig from torch import nn import torch from typing import Optional, Tuple class BloomMainModel(OffsiteTuningMainModel): def __init__( self, model_name_or_path, emulator_layer_num: int, adapter_top_layer_num: int = 2, adapter_bottom_layer_num: int = 2): self.model_name_or_path = model_name_or_path super().__init__( emulator_layer_num, adapter_top_layer_num, adapter_bottom_layer_num) def get_base_model(self): return BloomForCausalLM.from_pretrained(self.model_name_or_path) def get_model_transformer_blocks(self, model: BloomForCausalLM): return model.transformer.h def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model param_dict = { 'wte': model.transformer.word_embeddings, 'word_ln': model.transformer.word_embeddings_layernorm, 'last_ln_f': model.transformer.ln_f } addition_weights = self.get_numpy_state_dict(param_dict) wte = addition_weights.pop('wte') wte_dict = split_numpy_array(wte, 25, 'wte') addition_weights.update(wte_dict) return addition_weights def load_additional_param_state_dict(self, submodel_weights: dict): # load additional weights: model = self.model param_dict = { 'wte': model.transformer.word_embeddings, 'word_ln': model.transformer.word_embeddings_layernorm, 'last_ln_f': model.transformer.ln_f } new_submodel_weight = {} new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] new_submodel_weight['word_ln'] = submodel_weights['word_ln'] wte_dict = {} for k, v in submodel_weights.items(): if 'wte' in k: wte_dict[k] = v wte = recover_numpy_array(wte_dict, 'wte') new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) def forward( self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, **deprecated_arguments, ): return self.model( input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments, ) class BloomSubModel(OffsiteTuningSubModel): def __init__( self, model_name_or_path, emulator_layer_num: int, adapter_top_layer_num: int = 2, adapter_bottom_layer_num: int = 2, fp16_mix_precision=False, partial_weight_decay=None): self.model_name_or_path = model_name_or_path self.emulator_layer_num = emulator_layer_num self.adapter_top_layer_num = adapter_top_layer_num 
self.adapter_bottom_layer_num = adapter_bottom_layer_num super().__init__( emulator_layer_num, adapter_top_layer_num, adapter_bottom_layer_num, fp16_mix_precision) self.partial_weight_decay = partial_weight_decay def get_base_model(self): total_layer_num = self.emulator_layer_num + \ self.adapter_top_layer_num + self.adapter_bottom_layer_num config = BloomConfig.from_pretrained(self.model_name_or_path) config.num_hidden_layers = total_layer_num # initialize a model without pretrained weights return BloomForCausalLM(config) def get_model_transformer_blocks(self, model: BloomForCausalLM): return model.transformer.h def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model param_dict = { 'wte': model.transformer.word_embeddings, 'word_ln': model.transformer.word_embeddings_layernorm, 'last_ln_f': model.transformer.ln_f } addition_weights = self.get_numpy_state_dict(param_dict) wte = addition_weights.pop('wte') wte_dict = split_numpy_array(wte, 25, 'wte') addition_weights.update(wte_dict) return addition_weights def load_additional_param_state_dict(self, submodel_weights: dict): # load additional weights: model = self.model param_dict = { 'wte': model.transformer.word_embeddings, 'word_ln': model.transformer.word_embeddings_layernorm, 'last_ln_f': model.transformer.ln_f } new_submodel_weight = {} new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] new_submodel_weight['word_ln'] = submodel_weights['word_ln'] wte_dict = {} for k, v in submodel_weights.items(): if 'wte' in k: wte_dict[k] = v wte = recover_numpy_array(wte_dict, 'wte') new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) def forward( self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, **deprecated_arguments, ): return self.model( input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments, ) def parameters(self, recurse=True): if self.partial_weight_decay is None: return super().parameters(recurse) elif isinstance(self.partial_weight_decay, float): no_decay = ["bias", "layer_norm.weight"] return [ { "params": [ p for n, p in self.named_parameters() if not any( nd in n for nd in no_decay)], "weight_decay": self.partial_weight_decay}, { "params": [ p for n, p in self.named_parameters() if any( nd in n for nd in no_decay)], "weight_decay": 0.0}] else: raise ValueError( f"partial_weight_decay should be None or float, but got {self.partial_weight_decay}") ================================================ FILE: python/fate_llm/model_zoo/offsite_tuning/gpt2.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel, get_dropout_emulator_and_adapters, split_numpy_array, recover_numpy_array from transformers import GPT2LMHeadModel, GPT2Config import torch from typing import Optional, Tuple class GPT2LMHeadMainModel(OffsiteTuningMainModel): def __init__( self, model_name_or_path, emulator_layer_num: int, adapter_top_layer_num: int = 2, adapter_bottom_layer_num: int = 2): self.model_name_or_path = model_name_or_path super().__init__( emulator_layer_num, adapter_top_layer_num, adapter_bottom_layer_num) def get_base_model(self): return GPT2LMHeadModel.from_pretrained(self.model_name_or_path) def get_model_transformer_blocks(self, model: GPT2LMHeadModel): return model.transformer.h def forward(self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None,): return self.model( input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model param_dict = { 'wte': model.transformer.wte, 'wpe': model.transformer.wpe, 'last_ln_f': model.transformer.ln_f } addition_weights = self.get_numpy_state_dict(param_dict) wte = addition_weights.pop('wte') wte_dict = split_numpy_array(wte, 10, 'wte') wpe = addition_weights.pop('wpe') wpe_dict = split_numpy_array(wpe, 10, 'wpe') addition_weights.update(wte_dict) addition_weights.update(wpe_dict) return addition_weights def load_additional_param_state_dict(self, submodel_weights: dict): # load additional weights: model = self.model param_dict = { 'wte': model.transformer.wte, 'wpe': model.transformer.wpe, 'last_ln_f': model.transformer.ln_f } new_submodel_weight = {} new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] wte_dict, wpe_dict = {}, {} for k, v in submodel_weights.items(): if 'wte' in k: wte_dict[k] = v if 'wpe' in k: wpe_dict[k] = v wte = recover_numpy_array(wte_dict, 'wte') wpe = recover_numpy_array(wpe_dict, 'wpe') new_submodel_weight['wte'] = wte new_submodel_weight['wpe'] = wpe self.load_numpy_state_dict(param_dict, new_submodel_weight) class 
GPT2LMHeadSubModel(OffsiteTuningSubModel): def __init__( self, model_name_or_path, emulator_layer_num: int, adapter_top_layer_num: int = 2, adapter_bottom_layer_num: int = 2, fp16_mix_precision=False, partial_weight_decay=None): self.model_name_or_path = model_name_or_path self.emulator_layer_num = emulator_layer_num self.adapter_top_layer_num = adapter_top_layer_num self.adapter_bottom_layer_num = adapter_bottom_layer_num super().__init__( emulator_layer_num, adapter_top_layer_num, adapter_bottom_layer_num, fp16_mix_precision) self.partial_weight_decay = partial_weight_decay def get_base_model(self): total_layer_num = self.emulator_layer_num + \ self.adapter_top_layer_num + self.adapter_bottom_layer_num config = GPT2Config.from_pretrained(self.model_name_or_path) config.num_hidden_layers = total_layer_num # initialize a model without pretrained weights return GPT2LMHeadModel(config) def get_model_transformer_blocks(self, model: GPT2LMHeadModel): return model.transformer.h def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model param_dict = { 'wte': model.transformer.wte, 'wpe': model.transformer.wpe, 'last_ln_f': model.transformer.ln_f } addition_weights = self.get_numpy_state_dict(param_dict) wte = addition_weights.pop('wte') wte_dict = split_numpy_array(wte, 10, 'wte') wpe = addition_weights.pop('wpe') wpe_dict = split_numpy_array(wpe, 10, 'wpe') addition_weights.update(wte_dict) addition_weights.update(wpe_dict) return addition_weights def load_additional_param_state_dict(self, submodel_weights: dict): # load additional weights: model = self.model param_dict = { 'wte': model.transformer.wte, 'wpe': model.transformer.wpe, 'last_ln_f': model.transformer.ln_f } new_submodel_weight = {} new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] wte_dict, wpe_dict = {}, {} for k, v in submodel_weights.items(): if 'wte' in k: wte_dict[k] = v if 'wpe' in k: wpe_dict[k] = v wte = recover_numpy_array(wte_dict, 'wte') wpe = recover_numpy_array(wpe_dict, 'wpe') new_submodel_weight['wte'] = wte new_submodel_weight['wpe'] = wpe self.load_numpy_state_dict(param_dict, new_submodel_weight) def forward(self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None,): return self.model( input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) def parameters(self, recurse=True): if self.partial_weight_decay is None: return super().parameters(recurse) elif isinstance(self.partial_weight_decay, float): no_decay = ["bias", "layer_norm.weight"] return [ { "params": [ p for n, p in 
self.named_parameters() if not any( nd in n for nd in no_decay)], "weight_decay": self.partial_weight_decay}, { "params": [ p for n, p in self.named_parameters() if any( nd in n for nd in no_decay)], "weight_decay": 0.0}] else: raise ValueError( f"partial_weight_decay should be None or float, but got {self.partial_weight_decay}") ================================================ FILE: python/fate_llm/model_zoo/offsite_tuning/llama.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel, get_dropout_emulator_and_adapters, split_numpy_array, recover_numpy_array from transformers import LlamaConfig, LlamaForCausalLM class LlamaMainModel(OffsiteTuningMainModel): def __init__( self, model_name_or_path, emulator_layer_num: int, adapter_top_layer_num: int = 2, adapter_bottom_layer_num: int = 2): self.model_name_or_path = model_name_or_path super().__init__( emulator_layer_num, adapter_top_layer_num, adapter_bottom_layer_num) def get_base_model(self): return LlamaForCausalLM.from_pretrained(self.model_name_or_path) def get_model_transformer_blocks(self, model: LlamaForCausalLM): return model.model.layers def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model param_dict = { 'wte': model.model.embed_tokens, 'last_ln_f': model.model.norm } addition_weights = self.get_numpy_state_dict(param_dict) wte = addition_weights.pop('wte') wte_dict = split_numpy_array(wte, 25, 'wte') addition_weights.update(wte_dict) return addition_weights def load_additional_param_state_dict(self, submodel_weights: dict): # load additional weights: model = self.model param_dict = { 'wte': model.model.embed_tokens, 'last_ln_f': model.model.norm } new_submodel_weight = {} new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] wte_dict = {} for k, v in submodel_weights.items(): if 'wte' in k: wte_dict[k] = v wte = recover_numpy_array(wte_dict, 'wte') new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) def forward(self, **kwargs): return self.model(**kwargs) class LlamaSubModel(OffsiteTuningSubModel): def __init__( self, model_name_or_path, emulator_layer_num: int, adapter_top_layer_num: int = 2, adapter_bottom_layer_num: int = 2, fp16_mix_precision=False, partial_weight_decay=None): self.model_name_or_path = model_name_or_path self.emulator_layer_num = emulator_layer_num self.adapter_top_layer_num = adapter_top_layer_num self.adapter_bottom_layer_num = adapter_bottom_layer_num super().__init__( emulator_layer_num, adapter_top_layer_num, adapter_bottom_layer_num, fp16_mix_precision) self.partial_weight_decay = partial_weight_decay def get_base_model(self): total_layer_num = self.emulator_layer_num + \ self.adapter_top_layer_num + self.adapter_bottom_layer_num config = LlamaConfig.from_pretrained(self.model_name_or_path) 
config.num_hidden_layers = total_layer_num # initialize a model without pretrained weights return LlamaForCausalLM(config) def get_model_transformer_blocks(self, model: LlamaForCausalLM): return model.model.layers def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model param_dict = { 'wte': model.model.embed_tokens, 'last_ln_f': model.model.norm } addition_weights = self.get_numpy_state_dict(param_dict) wte = addition_weights.pop('wte') wte_dict = split_numpy_array(wte, 25, 'wte') addition_weights.update(wte_dict) return addition_weights def load_additional_param_state_dict(self, submodel_weights: dict): # load additional weights: model = self.model param_dict = { 'wte': model.model.embed_tokens, 'last_ln_f': model.model.norm } new_submodel_weight = {} new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] wte_dict = {} for k, v in submodel_weights.items(): if 'wte' in k: wte_dict[k] = v wte = recover_numpy_array(wte_dict, 'wte') new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) def forward(self, **kwargs): return self.model(**kwargs) def parameters(self, recurse=True): if self.partial_weight_decay is None: return super().parameters(recurse) elif isinstance(self.partial_weight_decay, float): no_decay = ["bias", "layer_norm.weight"] return [ { "params": [ p for n, p in self.named_parameters() if not any( nd in n for nd in no_decay)], "weight_decay": self.partial_weight_decay}, { "params": [ p for n, p in self.named_parameters() if any( nd in n for nd in no_decay)], "weight_decay": 0.0}] else: raise ValueError( f"partial_weight_decay should be None or float, but got {self.partial_weight_decay}") ================================================ FILE: python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
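# ---------------------------------------------------------------------------
# Illustrative sketch (added note, not part of the original module): the
# helpers split_numpy_array / recover_numpy_array defined below break a large
# embedding matrix into named row-slices before transfer and rebuild it on the
# other side. The vocabulary size, embedding dim and chunk count here are
# made-up values for demonstration only.
# ---------------------------------------------------------------------------
def _example_split_and_recover_embedding():
    import numpy as np  # numpy is also imported by this module as `np`

    # pretend this is a word-embedding state dict: vocab_size=1000, dim=64
    embedding = {"weight": np.random.rand(1000, 64).astype(np.float32)}

    # split the rows into 25 named slices: {'wte0': ..., ..., 'wte24': ...}
    slices = split_numpy_array(embedding, 25, "wte")

    # concatenate the slices back into a single {'weight': ...} dict
    restored = recover_numpy_array(slices, "wte")
    assert np.array_equal(embedding["weight"], restored["weight"])
    return restored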
# import torch as t from torch import nn from transformers import AutoModel import numpy as np import logging logger = logging.getLogger(__name__) def get_dropout_emulator_and_adapters( transformer_layers: nn.ModuleList, emulator_layer_num: int, adapter_top_layer_num: int, adapter_bottom_layer_num: int): assert adapter_bottom_layer_num > 0 and adapter_top_layer_num > 0, "adapter layer num must be greater than 0" assert emulator_layer_num < len( transformer_layers), "emulator layer num must be less than the number of transformer layers" assert adapter_bottom_layer_num + adapter_top_layer_num < len( transformer_layers), "adapter layer num must be less than the number of transformer layers" assert emulator_layer_num < len( transformer_layers) and emulator_layer_num > 0, "emulator layer num must be less than the number of transformer layers" bottom_idx = adapter_bottom_layer_num top_idx = len(transformer_layers) - adapter_top_layer_num bottom_layers = transformer_layers[:bottom_idx] top_layers = transformer_layers[top_idx:] kept_layers = transformer_layers[bottom_idx:top_idx] emulator = nn.ModuleList() stride = (len(kept_layers) - 1) / (emulator_layer_num - 1) layer_idx = [] for i in range(emulator_layer_num): idx = int(round(i * stride)) layer_idx.append(idx) emulator.append(kept_layers[idx]) logger.info( 'take layer {} of the original model as the emulator'.format( t.Tensor(layer_idx) + bottom_idx)) return nn.ModuleList(emulator), nn.ModuleList( bottom_layers), nn.ModuleList(top_layers) def split_numpy_array(embedding_matrix, n, suffix): # Calculate the indices where the splits should occur embedding_matrix = embedding_matrix['weight'] indices = np.linspace(0, embedding_matrix.shape[0], n+1, dtype=int) # Split the embedding matrix at the calculated indices slices = [embedding_matrix[indices[i]:indices[i+1]] for i in range(n)] # Create a dictionary with the slices result_dict = {suffix+str(i): slice for i, slice in enumerate(slices)} return result_dict def recover_numpy_array(slices_dict, suffix=""): # Get the slices from the dictionary and concatenate them slices = [slices_dict[suffix + str(i)] for i in range(len(slices_dict))] complete_array = np.concatenate(slices, axis=0) return {'weight': complete_array} class OffsiteTuningBaseModel(t.nn.Module): def __init__(self, emulator_layer_num: int, adapter_top_layer_num: int = 2, adapter_bottom_layer_num: int = 2, fp16_mix_precision=False): super().__init__() self.fp16_mix_precision = fp16_mix_precision self.model = self.get_base_model() self.initialize_model() self.emulator, self.adapter_bottom, self.adapter_top = get_dropout_emulator_and_adapters( transformer_layers=self.get_model_transformer_blocks(self.model), emulator_layer_num=emulator_layer_num, adapter_top_layer_num=adapter_top_layer_num, adapter_bottom_layer_num=adapter_bottom_layer_num ) self.post_initialization() def initialize_model(self): if self.fp16_mix_precision: self.model.half() for param in self.model.parameters(): param.requires_grad = False def post_initialization(self): pass def get_adapter_top(self): return self.adapter_top def get_adapter_bottom(self): return self.adapter_bottom def get_emulator(self): return self.emulator def get_additional_param_state_dict(self): # get parameter of additional parameter return {} def load_additional_param_state_dict(self, submodel_weights: dict): # load additional weights: pass def _get_numpy_arr(self, v): if v.dtype == t.bfloat16: # float 32 v = v.detach().cpu().float().numpy() else: v = v.detach().cpu().numpy() return v def 
load_numpy_state_dict(self, module_dict, state_dict): param_dict = module_dict for k, v in param_dict.items(): if k not in state_dict: continue addition_weights = { k: t.tensor(v) for k, v in state_dict[k].items()} v.load_state_dict(addition_weights) def get_numpy_state_dict(self, module_dict): weight_dict = {} for k, v in module_dict.items(): weight_dict[k] = { k: self._get_numpy_arr(v) for k, v in v.state_dict().items()} return weight_dict def get_submodel_weights(self, with_emulator=True) -> dict: if with_emulator: submodel_weights = { "emulator": { k: self._get_numpy_arr(v) for k, v in self.get_emulator().state_dict().items()}, "adapter_top": { k: self._get_numpy_arr(v) for k, v in self.get_adapter_top().state_dict().items()}, "adapter_bottom": { k: self._get_numpy_arr(v) for k, v in self.get_adapter_bottom().state_dict().items()}} else: submodel_weights = { "adapter_top": { k: self._get_numpy_arr(v) for k, v in self.get_adapter_top().state_dict().items()}, "adapter_bottom": { k: self._get_numpy_arr(v) for k, v in self.get_adapter_bottom().state_dict().items()}} addition_weights = self.get_additional_param_state_dict() submodel_weights.update(addition_weights) return submodel_weights def load_submodel_weights(self, submodel_weights: dict, with_emulator=True): if with_emulator: emulator_weights = { k: t.tensor(v) for k, v in submodel_weights['emulator'].items()} emulator = self.get_emulator() emulator.load_state_dict(emulator_weights) adapter_top_weights = { k: t.tensor(v) for k, v in submodel_weights['adapter_top'].items()} adapter_bottom_weights = { k: t.tensor(v) for k, v in submodel_weights['adapter_bottom'].items()} adapter_top = self.get_adapter_top() adapter_bottom = self.get_adapter_bottom() adapter_top.load_state_dict(adapter_top_weights) adapter_bottom.load_state_dict(adapter_bottom_weights) self.load_additional_param_state_dict(submodel_weights) def forward(self, **kwargs): raise NotImplementedError() def get_base_model(self): raise NotImplementedError() def get_model_transformer_blocks(self, model: t.nn.Module): raise NotImplementedError() class OffsiteTuningMainModel(OffsiteTuningBaseModel): def post_initialization(self): pass class OffsiteTuningSubModel(OffsiteTuningBaseModel): def post_initialization(self): # mix precision model training for param in self.adapter_top.parameters(): param.data = param.data.float() param.requires_grad = True for param in self.adapter_bottom.parameters(): param.data = param.data.float() param.requires_grad = True ================================================ FILE: python/fate_llm/model_zoo/pellm/__init__.py ================================================ ================================================ FILE: python/fate_llm/model_zoo/pellm/albert.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
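# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): wrapping an
# ALBERT checkpoint with a LoRA adapter through the PELLM interface defined in
# parameter_efficient_llm.py. The checkpoint name, LoRA hyper-parameters and
# target module names below are assumptions made for this example only.
# ---------------------------------------------------------------------------
def _example_albert_with_lora():
    model = Albert(                              # Albert is defined below
        pretrained_path="albert-base-v2",        # assumed local/hub checkpoint
        peft_type="LoraConfig",                  # must be a peft "*Config" class name
        peft_config={
            "task_type": "SEQ_CLS",
            "r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "target_modules": ["query", "value"],  # assumed attention projections
        },
    )
    return model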
# from transformers import AlbertConfig, AutoConfig from transformers import AlbertForSequenceClassification from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class Albert(PELLM): config_class = AlbertConfig model_loader = AlbertForSequenceClassification def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs ) -> None: if pretrained_path is not None: self.check_config(pretain_path=pretrained_path) if config is None and pretrained_path is None: config = AlbertConfig().to_dict() # use default model setting super().__init__( config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def check_config(self, pretain_path): config = AutoConfig.from_pretrained(pretain_path) assert isinstance( config, AlbertConfig), 'The config of pretrained model must be AlbertConfig, but got {}'.format( type(config)) ================================================ FILE: python/fate_llm/model_zoo/pellm/bart.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import BartConfig, AutoConfig from transformers import BartForSequenceClassification from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class Bart(PELLM): config_class = BartConfig model_loader = BartForSequenceClassification def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs) -> None: if pretrained_path is not None: self.check_config(pretrain_path=pretrained_path) if config is None and pretrained_path is None: config = BartConfig().to_dict() super().__init__( config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( config, BartConfig), 'The config of pretrained model must be BartConfig, but got {}'.format( type(config)) ================================================ FILE: python/fate_llm/model_zoo/pellm/bert.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
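# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): extra keyword
# arguments passed to the wrapper are forwarded to the underlying HuggingFace
# config via `config.update(...)` in PELLM.init_config, which is how e.g. the
# number of classification labels can be set. Checkpoint name and values are
# assumptions for the example.
# ---------------------------------------------------------------------------
def _example_bert_multiclass_lora():
    model = Bert(                               # Bert is defined below
        pretrained_path="bert-base-uncased",    # assumed checkpoint
        peft_type="LoraConfig",
        peft_config={"task_type": "SEQ_CLS", "r": 8, "lora_alpha": 16},
        num_labels=4,                           # forwarded into the BertConfig
    )
    return model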
# from transformers import BertConfig, AutoConfig from transformers import BertForSequenceClassification from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class Bert(PELLM): config_class = BertConfig model_loader = BertForSequenceClassification def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs) -> None: if pretrained_path is not None: self.check_config(pretrain_path=pretrained_path) if config is None and pretrained_path is None: config = BertConfig().to_dict() super().__init__( config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( config, BertConfig), 'The config of pretrained model must be BertConfig, but got {}'.format( type(config)) ================================================ FILE: python/fate_llm/model_zoo/pellm/bloom.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import BloomConfig from transformers import BloomForCausalLM from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class Bloom(PELLM): config_class = BloomConfig model_loader = BloomForCausalLM def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs ) -> None: if config is None and pretrained_path is None: config = BloomConfig().to_dict() # use default model setting super().__init__(config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) ================================================ FILE: python/fate_llm/model_zoo/pellm/chatglm.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
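# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): when
# `pre_seq_len` is given, ChatGLM.add_peft() below switches to the model's
# built-in prefix encoder (P-Tuning v2 style) instead of attaching a peft
# adapter. The checkpoint name and prefix length are assumptions.
# ---------------------------------------------------------------------------
def _example_chatglm_prefix_tuning():
    model = ChatGLM(                            # ChatGLM is defined below
        pretrained_path="THUDM/chatglm3-6b",    # assumed checkpoint path
        pre_seq_len=128,                        # enables the prefix-encoder branch
        prefix_projection=False,
        trust_remote_code=True,                 # ChatGLM repos ship custom modeling code
    )
    return model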
# from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM from transformers import AutoConfig class ChatGLM(PELLM): def __init__(self, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, pre_seq_len: int = None, prefix_projection: bool = False, **kwargs) -> None: self.pre_seq_len = pre_seq_len self.prefix_projection = prefix_projection super().__init__(pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs ) def init_config(self): self.config = AutoConfig.from_pretrained( self.config_path, trust_remote_code=True) self.config.pre_seq_len = self.pre_seq_len self.config.prefix_projection = self.prefix_projection def add_peft(self): if self.pre_seq_len: self._pe_lm.half() self._pe_lm.transformer.prefix_encoder.float() else: super(ChatGLM, self).add_peft() ================================================ FILE: python/fate_llm/model_zoo/pellm/deberta.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import DebertaConfig, AutoConfig from transformers import DebertaForSequenceClassification from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class Deberta(PELLM): config_class = DebertaConfig model_loader = DebertaForSequenceClassification def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs) -> None: if pretrained_path is not None: self.check_config(pretrain_path=pretrained_path) if config is None and pretrained_path is None: config = DebertaConfig().to_dict() super().__init__( config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( config, DebertaConfig), 'The config of pretrained model must be DebertaConfig, but got {}'.format( type(config)) ================================================ FILE: python/fate_llm/model_zoo/pellm/distilbert.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
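# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): for peft configs
# with task_type SEQ_CLS, PELLM.forward returns the classification logits
# directly, so the wrapped model can be called like a plain classifier. The
# checkpoint name and input text are assumptions for the example.
# ---------------------------------------------------------------------------
def _example_distilbert_inference():
    from transformers import AutoTokenizer     # local import for the sketch

    model = DistilBert(                        # DistilBert is defined below
        pretrained_path="distilbert-base-uncased",  # assumed checkpoint
        peft_type="LoraConfig",
        peft_config={"task_type": "SEQ_CLS", "r": 8},
    )
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    batch = tokenizer("a short example sentence", return_tensors="pt")
    logits = model(**batch)                    # logits, not the full model output
    return logits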
# from transformers import DistilBertConfig, AutoConfig from transformers import DistilBertForSequenceClassification from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class DistilBert(PELLM): config_class = DistilBertConfig model_loader = DistilBertForSequenceClassification def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs) -> None: if pretrained_path is not None: self.check_config(pretrain_path=pretrained_path) if config is None and pretrained_path is None: config = DistilBertConfig().to_dict() super().__init__( config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( config, DistilBertConfig), 'The config of pretrained model must be DistilBertConfig, but got {}'.format( type(config)) ================================================ FILE: python/fate_llm/model_zoo/pellm/gpt2.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import GPT2Config, AutoConfig from transformers import GPT2ForSequenceClassification, AutoModelForCausalLM from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class GPT2(PELLM): config_class = GPT2Config model_loader = GPT2ForSequenceClassification def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs) -> None: if pretrained_path is not None: self.check_config(pretrain_path=pretrained_path) if config is None and pretrained_path is None: config = GPT2Config().to_dict() super().__init__( config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( config, GPT2Config), 'The config of pretrained model must be GPT2Config, but got {}'.format( type(config)) class GPT2CLM(GPT2): model_loader = AutoModelForCausalLM ================================================ FILE: python/fate_llm/model_zoo/pellm/llama.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
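# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): wrapping a LLaMA
# checkpoint with a causal-LM LoRA adapter in fp16. The checkpoint path, LoRA
# hyper-parameters and target module names are assumptions for the example.
# ---------------------------------------------------------------------------
def _example_llama_lora():
    model = LLaMa(                              # LLaMa is defined below
        pretrained_path="/path/to/llama-checkpoint",  # placeholder path
        peft_type="LoraConfig",
        peft_config={
            "task_type": "CAUSAL_LM",
            "r": 16,
            "lora_alpha": 32,
            "target_modules": ["q_proj", "v_proj"],   # assumed projection names
        },
        torch_dtype="float16",                  # resolved via getattr(torch, ...) in PELLM
    )
    # typically called after training; persists only the adapter weights
    model.save_trainable("./llama_lora_adapter")
    return model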
# from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM from transformers import AutoConfig from transformers import LlamaConfig from transformers import LlamaForCausalLM class LLaMa(PELLM): config_class = LlamaConfig def __init__(self, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs) -> None: super().__init__(pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def init_base_lm(self, **kwargs): if self.config is not None: self._pe_lm = LlamaForCausalLM.from_pretrained(self.config_path, config=self.config, torch_dtype=self.torch_dtype, **kwargs) elif self.config_path is not None: self._pe_lm = LlamaForCausalLM.from_pretrained(self.config_path, torch_dtype=self.torch_dtype, **kwargs) else: raise ValueError( 'config_path to pretrained model folder cannot be None') def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( config, LlamaConfig), 'The config of pretrained model must be LlamaConfig, but got {}'.format( type(config)) ================================================ FILE: python/fate_llm/model_zoo/pellm/opt.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import OPTConfig from transformers import OPTForCausalLM from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class OPT(PELLM): config_class = OPTConfig model_loader = OPTForCausalLM def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs ) -> None: if config is None and pretrained_path is None: config = OPTConfig().to_dict() # use default model setting super().__init__(config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) ================================================ FILE: python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
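# ---------------------------------------------------------------------------
# Illustrative sketch (added note, not part of the original module): add_peft()
# below accepts any class name exported by the peft package that ends with
# "Config" (see AVAILABLE_PEFT_CONFIG), e.g. "LoraConfig", "PrefixTuningConfig",
# "PromptTuningConfig", and instantiates it from the peft_config dict. The
# values below are arbitrary example hyper-parameters.
# ---------------------------------------------------------------------------
def _example_resolve_peft_config():
    import peft as _peft  # peft is also imported by this module

    # roughly what add_peft() does with peft_type="LoraConfig" and a dict:
    peft_config = getattr(_peft, "LoraConfig")(r=8, lora_alpha=16, lora_dropout=0.05)
    return peft_config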
# import peft import torch from collections.abc import Mapping from peft import PeftModel, TaskType from transformers import AutoConfig from transformers import AutoModel from transformers.configuration_utils import PretrainedConfig import logging logger = logging.getLogger(__name__) AVAILABLE_PEFT_CONFIG = list( filter( lambda peft_type: peft_type.endswith("Config"), dir(peft) ) ) class PELLM(torch.nn.Module): config_class: PretrainedConfig = None model_loader = None def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config=None, torch_dtype: str = None, trust_remote_code: bool = False, **kwargs ) -> None: super().__init__() self._pe_lm: PeftModel = None self.config = config self.config_path = pretrained_path self.peft_type = peft_type self.peft_config = peft_config self.torch_dtype = None if not torch_dtype else getattr(torch, torch_dtype) self.trust_remote_code = trust_remote_code assert self.config_path is not None or self.config is not None, \ "At least one of config_path and config must be set." self._init_pelm(**kwargs) def _init_pelm(self, **kwargs): self.init_lm_with_peft(**kwargs) self.model_summary() def init_lm_with_peft(self, **kwargs): self.init_config(**kwargs) self.init_base_lm() self.add_peft() def init_config(self, **kwargs): if self.config_path is not None: self.config = AutoConfig.from_pretrained(self.config_path, trust_remote_code=self.trust_remote_code) elif self.config is not None and self.config_class is not None: self.config = self.config_class().from_dict(self.config) else: raise ValueError( 'config_path to pretrained model folder and model config dict cannot be None at the same time, ' 'you need to specify one of them') if kwargs: self.config.update(kwargs) def init_base_lm(self, **kwargs): model_loader = self.model_loader if self.model_loader is not None else AutoModel if self.config is not None: self._pe_lm = model_loader.from_pretrained( self.config_path, config=self.config, torch_dtype=self.torch_dtype, **kwargs, trust_remote_code=self.trust_remote_code ) elif self.config_path is not None: self._pe_lm = model_loader.from_pretrained( self.config_path, torch_dtype=self.torch_dtype, trust_remote_code=self.trust_remote_code, **kwargs) else: raise ValueError( 'config_path to pretrained model folder cannot be None') def add_peft(self): assert self.peft_type in AVAILABLE_PEFT_CONFIG, 'peft name {} not in available config {}'.format( self.peft_type, AVAILABLE_PEFT_CONFIG) if self.peft_config is None: peft_config = getattr(peft, self.peft_type)() elif isinstance(self.peft_config, dict): peft_config = getattr(peft, self.peft_type)(**self.peft_config) else: raise ValueError(f"Can not parse peft_config of {type(self.peft_config)}") self._pe_lm = peft.get_peft_model(self._pe_lm, peft_config) self.peft_config = peft_config def model_summary(self): if hasattr(self._pe_lm, "print_trainable_parameters"): summary = self._pe_lm.print_trainable_parameters() logger.debug(f'PELLM model summary: \n{summary}') def forward(self, *args, **kwargs): forward_ret = self._pe_lm.forward(*args, **kwargs) if self.peft_config is None or self.peft_config.task_type != TaskType.SEQ_CLS: return forward_ret else: return forward_ret.logits def save_trainable(self, output_path): self._pe_lm.save_pretrained(output_path) class AutoPELLM(PELLM): def __init__(self, **kwargs) -> None: super().__init__(**kwargs) ================================================ FILE: python/fate_llm/model_zoo/pellm/qwen.py ================================================ # # 
Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import Qwen2Config from transformers import Qwen2ForCausalLM from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class Qwen(PELLM): config_class = Qwen2Config model_loader = Qwen2ForCausalLM def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs ) -> None: if config is None and pretrained_path is None: config = Qwen2Config().to_dict() # use default model setting super().__init__(config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) ================================================ FILE: python/fate_llm/model_zoo/pellm/roberta.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from transformers import RobertaConfig, AutoConfig from transformers import RobertaForSequenceClassification from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM class Roberta(PELLM): config_class = RobertaConfig model_loader = RobertaForSequenceClassification def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, **kwargs) -> None: if pretrained_path is not None: self.check_config(pretrain_path=pretrained_path) if config is None and pretrained_path is None: config = RobertaConfig().to_dict() super().__init__( config=config, pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config, **kwargs) def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( config, RobertaConfig), 'The config of pretrained model must be RobertaConfig, but got {}'.format( type(config)) ================================================ FILE: python/fate_llm/runner/__init__.py ================================================ ================================================ FILE: python/fate_llm/runner/fdkt_runner.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import logging import torch from fate.components.components.nn.nn_runner import ( load_model_dict_from_path, dir_warning, loader_load_from_conf, run_dataset_func, ) from typing import Dict from fate.components.components.nn.loader import Loader from typing import Union, Optional, Literal from transformers.trainer_utils import get_last_checkpoint from fate.arch.dataframe import DataFrame from fate.components.components.nn.runner.homo_default_runner import DefaultRunner from fate_llm.algo.fdkt import FDKTTrainingArguments, FDKTSLM, FDKTLLM logger = logging.getLogger(__name__) AUG_DATA_SAVED_PATH_SUFFIX = "aug_data.pkl" DP_MODEL_SAVED_PATH_SUFFIX = "dp_model" class FDKTRunner(DefaultRunner): def __init__( self, algo: str = "fdkt", inference_inst_conf: Optional[Dict] = None, model_conf: Optional[Dict] = None, embedding_model_conf: Optional[Dict] = None, optimizer_conf: Optional[Dict] = None, training_args_conf: Optional[Dict] = None, dataset_conf: Optional[Dict] = None, data_collator_conf: Optional[Dict] = None, tokenizer_conf: Optional[Dict] = None, task_type: Literal["causal_lm", "others"] = "causal_lm", save_dp_model: bool = False, ) -> None: super(FDKTRunner, self).__init__() self.algo = algo self.inference_inst_conf = inference_inst_conf self.model_conf = model_conf self.embedding_model_conf = embedding_model_conf self.optimizer_conf = optimizer_conf self.training_args_conf = training_args_conf self.dataset_conf = dataset_conf self.data_collator_conf = data_collator_conf self.tokenizer_conf = tokenizer_conf self.task_type = task_type self.save_dp_model = save_dp_model self.training_args = None # check param if self.algo.lower() != "fdkt": raise ValueError(f"algo should be fdkt") if self.task_type not in ["causal_lm"]: raise ValueError("task_type should be causal_lm") def common_setup(self, saved_model=None, output_dir=None): ctx = self.get_context() if output_dir is None: output_dir = "./" if self.model_conf is not None: model = loader_load_from_conf(self.model_conf) else: model = None resume_path = None if saved_model is not None: model_dict = load_model_dict_from_path(saved_model) model.load_state_dict(model_dict) logger.info(f"loading model dict from {saved_model} to model done") if get_last_checkpoint(saved_model) is not None: resume_path = saved_model logger.info(f"checkpoint detected, resume_path set to {resume_path}") # load tokenizer if import conf provided if self.tokenizer_conf is not None: tokenizer = loader_load_from_conf(self.tokenizer_conf) else: tokenizer = None # args dir_warning(self.training_args_conf) training_args = FDKTTrainingArguments(**self.training_args_conf) # reset to default, saving to arbitrary path is not allowed in # DefaultRunner training_args.output_dir = output_dir training_args.resume_from_checkpoint = resume_path # resume path self.training_args = training_args dataset = loader_load_from_conf(self.dataset_conf) return ctx, model, tokenizer, training_args, dataset def llm_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None): ctx, model, tokenizer, training_args, dataset = self.common_setup( output_dir=output_dir, 
saved_model=saved_model) if model is not None: model = model.load() inference_inst = None if self.inference_inst_conf is not None: inference_inst = loader_load_from_conf(self.inference_inst_conf) embedding_model = loader_load_from_conf(self.embedding_model_conf) if embedding_model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") embedding_model = embedding_model.load() trainer = FDKTLLM( ctx=ctx, inference_inst=inference_inst, model=model, embedding_model=embedding_model, training_args=training_args, tokenizer=tokenizer, dataset=dataset, ) return trainer def slm_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None): ctx, model, tokenizer, training_args, dataset = self.common_setup( output_dir=output_dir, saved_model=saved_model) model = model.load() dataset.load(train_set) if self.data_collator_conf is not None: data_collator = loader_load_from_conf(self.data_collator_conf) else: data_collator = None optimizer_loader = Loader.from_dict(self.optimizer_conf) optimizer_ = optimizer_loader.load_item() optimizer_params = optimizer_loader.kwargs optimizer = optimizer_(model.parameters(), **optimizer_params) trainer = FDKTSLM( ctx=ctx, model=model, training_args=training_args, tokenizer=tokenizer, train_set=dataset, data_collator=data_collator, optimizer=optimizer, ) return trainer def train( self, train_data: Optional[Union[str, DataFrame]] = None, validate_data: Optional[Union[str, DataFrame]] = None, output_dir: str = None, saved_model_path: str = None, ): if self.is_client(): trainer = self.slm_setup(train_set=train_data, validate_set=validate_data, output_dir=output_dir, saved_model=saved_model_path) aug_data = trainer.aug_data() data_saved_path = output_dir + '/' + AUG_DATA_SAVED_PATH_SUFFIX logger.info('result save to path {}'.format(data_saved_path)) torch.save(aug_data, data_saved_path) if self.save_dp_model: model_save_dir = output_dir + "/" + DP_MODEL_SAVED_PATH_SUFFIX trainer.save_model(model_save_dir) else: trainer = self.llm_setup( train_set=train_data, validate_set=validate_data, output_dir=output_dir, saved_model=saved_model_path ) trainer.aug_data() def predict(self, *args, **kwargs): pass ================================================ FILE: python/fate_llm/runner/fedcot_runner.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
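# ---------------------------------------------------------------------------
# Construction sketch (illustrative, not part of the original file): the
# FedCoTRunner defined below is configured entirely through conf dicts. The
# training_args_conf keys map onto Seq2SeqTrainingArguments fields; the other
# loader-style conf dicts follow the schema of fate's Loader and are left as
# empty placeholders here, so this only demonstrates construction, not a run.
# ---------------------------------------------------------------------------
def _example_fedcot_runner_conf():
    runner = FedCoTRunner(
        mode="infer_and_train",
        model_conf={},               # placeholder loader dict for the SLM
        dataset_conf={},             # placeholder loader dict for the dataset
        tokenizer_conf={},           # placeholder loader dict for the tokenizer
        data_collator_conf={},       # placeholder loader dict for the collator
        infer_inst_init_conf={},     # placeholder InferInit loader dict
        training_args_conf={
            "per_device_train_batch_size": 1,
            "num_train_epochs": 1,
            "learning_rate": 5e-5,
        },
        remote_inference_kwargs={},
        local_inference_kwargs={},
    )
    return runner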
import torch from fate.components.components.nn.nn_runner import ( NNRunner, load_model_dict_from_path, dir_warning, loader_load_from_conf, ) from fate_llm.model_zoo.hf_model import HFAutoModelForCausalLM from fate.components.components.nn.loader import Loader from fate.arch.dataframe import DataFrame from fate.ml.nn.dataset.base import Dataset from typing import Dict from fate_llm.algo.fedcot.fedcot_trainer import FedCoTTrainerClient, FedCoTTraineServer from fate_llm.algo.fedcot.encoder_decoder.slm_encoder_decoder import SLMEncoderDecoderClient, SLMEncoderDecoderServer from fate_llm.algo.inferdpt.init._init import InferInit import torch.nn as nn import torch.optim as optim from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments from typing import Union, Type, Callable, Optional from transformers.trainer_utils import get_last_checkpoint from typing import Literal import logging logger = logging.getLogger(__name__) def _check_instances( model: nn.Module = None, optimizer: optim.Optimizer = None, train_args: Seq2SeqTrainingArguments = None, data_collator: Callable = None, ) -> None: if model is not None and not issubclass(type(model), nn.Module): raise TypeError(f"SetupReturn Error: model must be a subclass of torch.nn.Module but got {type(model)}") if optimizer is not None and not issubclass(type(optimizer), optim.Optimizer): raise TypeError( f"SetupReturn Error: optimizer must be a subclass of torch.optim.Optimizer but got {type(optimizer)}" ) if train_args is not None and not isinstance(train_args, Seq2SeqTrainingArguments): raise TypeError( f"SetupReturn Error: train_args must be an instance of Seq2SeqTrainingArguments " f"but got {type(train_args)}" ) if data_collator is not None and not callable(data_collator): raise TypeError(f"SetupReturn Error: data_collator must be callable but got {type(data_collator)}") class FedCoTRunner(NNRunner): def __init__( self, mode: Literal['train_only', 'infer_only', 'infer_and_train'], model_conf: Optional[Dict] = None, dataset_conf: Optional[Dict] = None, optimizer_conf: Optional[Dict] = None, training_args_conf: Optional[Dict] = None, data_collator_conf: Optional[Dict] = None, tokenizer_conf: Optional[Dict] = None, infer_inst_init_conf: Dict = None, encode_template: str = None, instruction_template: str = None, decode_template: str = None, remote_inference_kwargs: Dict = {}, local_inference_kwargs: Dict = {}, perturb_doc_key: str = 'perturbed_doc', perturbed_response_key: str = 'perturbed_response', result_key: str = 'infer_result', ) -> None: super(NNRunner, self).__init__() self.model_conf = model_conf self.dataset_conf = dataset_conf self.optimizer_conf = optimizer_conf self.training_args_conf = training_args_conf self.data_collator_conf = data_collator_conf self.mode = mode self.tokenizer_conf = tokenizer_conf self.infer_inst_init_conf = infer_inst_init_conf self.encode_template = encode_template self.instruction_template = instruction_template self.decode_template = decode_template self.remote_inference_kwargs = remote_inference_kwargs self.local_inference_kwargs = local_inference_kwargs self.perturb_doc_key = perturb_doc_key self.perturbed_response_key = perturbed_response_key self.result_key = result_key self._temp_data_path = '' # setup var self.trainer = None self.training_args = None def _get_infer_inst(self, init_conf): if init_conf is None: return None loader = Loader.from_dict(init_conf) init_inst = loader.load_item()(self.get_context()) assert isinstance(init_inst, InferInit), 'Need a InferInit class for initialization, 
but got {}'.format(type(init_inst)) infer_inst = init_inst.get_inst() logger.info('inferdpt inst loaded') return infer_inst def _prepare_data(self, data, data_name): if data is None: return None if isinstance(data, DataFrame) and self.dataset_conf is None: raise RuntimeError('DataFrame format dataset is not supported, please use bind path to load your dataset') else: dataset = loader_load_from_conf(self.dataset_conf) if hasattr(dataset, "load"): logger.info("load path is {}".format(data)) import os if os.path.exists(data) and os.path.isdir(data): self._temp_data_path = data load_output = dataset.load(data) if load_output is not None: dataset = load_output return dataset else: raise RuntimeError('You must offer an existing folder path as data input, but got {}'.format(data)) else: raise ValueError( f"The dataset {dataset} lacks a load() method, which is required for data parsing in the DefaultRunner. \ Please implement this method in your dataset class. You can refer to the base class 'Dataset' in 'fate.ml.nn.dataset.base' \ for the necessary interfaces to implement." ) if dataset is not None and not issubclass(type(dataset), Dataset): raise TypeError( f"SetupReturn Error: {data_name}_set must be a subclass of fate built-in Dataset but got {type(dataset)}, \n" f"You can get the class via: from fate.ml.nn.dataset.table import Dataset" ) return dataset def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): ctx = self.get_context() model = loader_load_from_conf(self.model_conf) if isinstance(model, HFAutoModelForCausalLM): model = model.load() if model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") if output_dir is None: output_dir = "./" resume_path = None if saved_model is not None: model_dict = load_model_dict_from_path(saved_model) model.load_state_dict(model_dict) logger.info(f"loading model dict from {saved_model} to model done") if get_last_checkpoint(saved_model) is not None: resume_path = saved_model logger.info(f"checkpoint detected, resume_path set to {resume_path}") # load optimizer if self.optimizer_conf: optimizer_loader = Loader.from_dict(self.optimizer_conf) optimizer_ = optimizer_loader.load_item() optimizer_params = optimizer_loader.kwargs optimizer = optimizer_(model.parameters(), **optimizer_params) else: optimizer = None # load collator func data_collator = loader_load_from_conf(self.data_collator_conf) # load tokenizer if import conf provided tokenizer = loader_load_from_conf(self.tokenizer_conf) # args dir_warning(self.training_args_conf) training_args = Seq2SeqTrainingArguments(**self.training_args_conf) # reset to default, saving to arbitrary path is not allowed in # DefaultRunner training_args.output_dir = output_dir training_args.resume_from_checkpoint = resume_path # resume path self.training_args = training_args if self.training_args.world_size > 0 and self.training_args.local_rank == 0: infer_client = self._get_infer_inst(self.infer_inst_init_conf) else: infer_client = None # only rank 0 need to load the client # prepare trainer trainer = FedCoTTrainerClient( ctx=ctx, training_args=training_args, train_set=train_set, val_set=validate_set, model=model, tokenizer=tokenizer, mode=self.mode, encode_template=self.encode_template, decode_template=self.decode_template, instruction_template=self.instruction_template, local_inference_kwargs=self.local_inference_kwargs, remote_inference_kwargs=self.remote_inference_kwargs, data_collator=data_collator, optimizer=optimizer, 
infer_client=infer_client, tmp_data_share_path=self._temp_data_path ) return trainer def server_setup(self, stage="train"): trainer = FedCoTTraineServer( ctx=self.get_context(), infer_server=self._get_infer_inst(self.infer_inst_init_conf) ) return trainer def train( self, train_data: Optional[Union[str]] = None, validate_data: Optional[Union[str]] = None, output_dir: str = None, saved_model_path: str = None, ): if self.is_client(): train_set = self._prepare_data(train_data, "train_data") validate_set = self._prepare_data(validate_data, "val_data") trainer = self.client_setup( train_set=train_set, validate_set=validate_set, output_dir=output_dir, saved_model=saved_model_path ) self.trainer = trainer trainer.train() if self.mode == 'infer_only': # save result dataset to the output dir saving_path = output_dir + '/' + 'inference_result.pkl' torch.save(train_set.dataset, saving_path) logger.info('inference result saved to {}'.format(saving_path)) else: if output_dir is not None: if self.training_args.deepspeed and self.training_args.local_rank != 0: pass else: trainer.save_model(output_dir) elif self.is_server(): if self.mode == 'train_only': return else: trainer = self.server_setup() trainer.train() def predict(self, test_data: Union[str], saved_model_path: str = None) -> None: logger.warning('The prediction mode is not supported by this algorithm in the current version. Please perform inference using locally saved models.') return ================================================ FILE: python/fate_llm/runner/fedkseed_runner.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
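# FedKSeedRunner wires the FedKSeed algorithm into the FATE NN component: the client builds a
# ClientTrainer around a causal LM plus a DataCollatorForLanguageModeling, while the server draws
# seed candidates via build_seed_candidates(fedkseed_args.k, ...) and aggregates with Trainer.
# A minimal, illustrative configuration sketch; except for the keyword names and the
# data_collator_conf["kwargs"]["tokenizer_params"] path read in client_setup(), the values and
# nested keys below are assumptions, not fixed by this file:
#
#     runner = FedKSeedRunner(
#         algo="fedkseed",
#         model_conf={...},           # Loader conf resolving to a causal LM (e.g. HFAutoModelForCausalLM)
#         training_args_conf={"num_train_epochs": 1, "per_device_train_batch_size": 1},
#         fed_args_conf={"k": 4096},  # consumed as fedkseed_args.k when building seed candidates
#         data_collator_conf={"kwargs": {"tokenizer_params": {"pretrained_model_name_or_path": "gpt2"}}},
#         task_type="causal_lm",
#     )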
import logging from typing import Dict from typing import Literal from typing import Optional import transformers from fate.components.components.nn.nn_runner import ( NNRunner, dir_warning, loader_load_from_conf, ) from fate.components.components.nn.runner.homo_default_runner import DefaultRunner from fate_llm.algo.fedkseed.fedkseed import Trainer, FedKSeedTrainingArguments, ClientTrainer from fate_llm.algo.fedkseed.zo_utils import build_seed_candidates from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments logger = logging.getLogger(__name__) SUPPORTED_ALGO = ["fedkseed"] class FedKSeedRunner(DefaultRunner): def __init__( self, algo: str = "fedkseed", model_conf: Optional[Dict] = None, dataset_conf: Optional[Dict] = None, optimizer_conf: Optional[Dict] = None, training_args_conf: Optional[Dict] = None, fed_args_conf: Optional[Dict] = None, data_collator_conf: Optional[Dict] = None, tokenizer_conf: Optional[Dict] = None, task_type: Literal["causal_lm", "others"] = "causal_lm", local_mode: bool = False, save_trainable_weights_only: bool = False, ) -> None: super(NNRunner, self).__init__() self.algo = algo self.model_conf = model_conf self.dataset_conf = dataset_conf self.optimizer_conf = optimizer_conf self.training_args_conf = training_args_conf self.fed_args_conf = fed_args_conf self.data_collator_conf = data_collator_conf self.local_mode = local_mode self.tokenizer_conf = tokenizer_conf self.task_type = task_type self.save_trainable_weights_only = save_trainable_weights_only # check param if self.algo not in SUPPORTED_ALGO: raise ValueError(f"algo should be one of {SUPPORTED_ALGO}") if self.task_type not in ["causal_lm", "others"]: raise ValueError("task_type should be one of ['causal_lm', 'others']") assert isinstance(self.local_mode, bool), "local_mode should be bool" # setup var self.trainer = None self.training_args = None def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): if self.algo != "fedkseed": raise ValueError(f"algo {self.algo} not supported") ctx = self.get_context() model = maybe_loader_load_from_conf(self.model_conf) if model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") if output_dir is None: output_dir = "./" tokenizer = transformers.AutoTokenizer.from_pretrained(**self.data_collator_conf["kwargs"]["tokenizer_params"]) data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) dir_warning(self.training_args_conf) training_args = Seq2SeqTrainingArguments(**self.training_args_conf) self.training_args = training_args training_args.output_dir = output_dir fedkseed_args = FedKSeedTrainingArguments(**self.fed_args_conf) logger.debug(f"training_args: {training_args}") logger.debug(f"fedkseed_args: {fedkseed_args}") trainer = ClientTrainer( ctx=ctx, model=model, training_args=training_args, fedkseed_args=fedkseed_args, data_collator=data_collator, tokenizer=tokenizer, train_dataset=train_set, eval_dataset=validate_set, ) return trainer def server_setup(self, stage="train"): if self.algo != "fedkseed": raise ValueError(f"algo {self.algo} not supported") ctx = self.get_context() fedkseed_args = FedKSeedTrainingArguments(**self.fed_args_conf) training_args = Seq2SeqTrainingArguments(**self.training_args_conf) seed_candidates = build_seed_candidates(fedkseed_args.k, low=0, high=2 ** 32) trainer = Trainer(ctx=ctx, seed_candidates=seed_candidates, args=training_args, fedkseed_args=fedkseed_args) return trainer def 
maybe_loader_load_from_conf(conf): from fate_llm.model_zoo.hf_model import HFAutoModelForCausalLM model = loader_load_from_conf(conf) if isinstance(model, HFAutoModelForCausalLM): model = model.load() return model ================================================ FILE: python/fate_llm/runner/fedmkt_runner.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json from fate.components.components.nn.nn_runner import ( load_model_dict_from_path, dir_warning, loader_load_from_conf, run_dataset_func, ) from typing import Dict from fate.components.components.nn.loader import Loader from fate.ml.nn.homo.fedavg import FedAVGArguments from typing import Union, Optional, Literal, List from transformers.trainer_utils import get_last_checkpoint import logging from fate.arch.dataframe import DataFrame from fate.components.components.nn.runner.homo_default_runner import DefaultRunner from fate_llm.algo.fedmkt import FedMKTTrainingArguments, FedMKTSLM, FedMKTLLM logger = logging.getLogger(__name__) class FedMKTRunner(DefaultRunner): def __init__( self, algo: str = "fedmkt", model_conf: Optional[Dict] = None, optimizer_conf: Optional[Dict] = None, training_args_conf: Optional[Dict] = None, fed_args_conf: Optional[Dict] = None, pub_dataset_conf: Optional[Dict] = None, priv_dataset_conf: Optional[Dict] = None, data_collator_conf: Optional[Dict] = None, tokenizer_conf: Optional[Dict] = None, llm_tokenizer_conf: Optional[Dict] = None, slm_tokenizers_conf: List[Optional[Dict]] = None, llm_to_slm_vocab_mapping_path: str = None, slm_to_llm_vocab_mapping_paths: List[str] = None, task_type: Literal["causal_lm", "others"] = "causal_lm", save_trainable_weights_only: bool = False, pub_dataset_path: str = None, ) -> None: super(FedMKTRunner, self).__init__() self.algo = algo self.model_conf = model_conf self.optimizer_conf = optimizer_conf self.training_args_conf = training_args_conf self.fed_args_conf = fed_args_conf self.pub_dataset_conf = pub_dataset_conf self.priv_dataset_conf = priv_dataset_conf self.data_collator_conf = data_collator_conf self.tokenizer_conf = tokenizer_conf self.llm_tokenizer_conf = llm_tokenizer_conf self.slm_tokenizers_conf = slm_tokenizers_conf self.llm_to_slm_vocab_mapping_path = llm_to_slm_vocab_mapping_path self.slm_to_llm_vocab_mapping_paths = slm_to_llm_vocab_mapping_paths self.task_type = task_type self.pub_dataset_path = pub_dataset_path self.save_trainable_weights_only = save_trainable_weights_only self.training_args = None # check param if self.algo.lower() != "fedmkt": raise ValueError(f"algo should be fedmkt") if self.task_type not in ["causal_lm"]: raise ValueError("task_type should be causal_lm") def common_setup(self, saved_model=None, output_dir=None): ctx = self.get_context() if output_dir is None: output_dir = "./" model = loader_load_from_conf(self.model_conf) if model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") resume_path = None if 
saved_model is not None: model_dict = load_model_dict_from_path(saved_model) model.load_state_dict(model_dict) logger.info(f"loading model dict from {saved_model} to model done") if get_last_checkpoint(saved_model) is not None: resume_path = saved_model logger.info(f"checkpoint detected, resume_path set to {resume_path}") # load optimizer if self.optimizer_conf: optimizer_loader = Loader.from_dict(self.optimizer_conf) optimizer_ = optimizer_loader.load_item() optimizer_params = optimizer_loader.kwargs optimizer = optimizer_(model.parameters(), **optimizer_params) else: optimizer = None # load tokenizer if import conf provided tokenizer = loader_load_from_conf(self.tokenizer_conf) # args dir_warning(self.training_args_conf) training_args = FedMKTTrainingArguments(**self.training_args_conf) # reset to default, saving to arbitrary path is not allowed in # DefaultRunner training_args.output_dir = output_dir training_args.resume_from_checkpoint = resume_path # resume path self.training_args = training_args if self.fed_args_conf is not None: fed_args = FedAVGArguments(**self.fed_args_conf) else: fed_args = None pub_dataset = loader_load_from_conf(self.pub_dataset_conf) pub_dataset.load(self.pub_dataset_path) return ctx, model, optimizer, tokenizer, training_args, fed_args, pub_dataset def llm_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None): ctx, model, optimizer, tokenizer, training_args, fed_args, pub_dataset = self.common_setup( output_dir=output_dir, saved_model=saved_model) if validate_set is not None: validate_dataset = loader_load_from_conf(self.pub_dataset_conf) validate_dataset.load(validate_set) else: validate_dataset = None slm_tokenizers = None if self.slm_tokenizers_conf: slm_tokenizers = [loader_load_from_conf(tokenizer_conf) for tokenizer_conf in self.slm_tokenizers_conf] slm_to_llm_vocab_mappings = [] for vocab_mapping_path in self.slm_to_llm_vocab_mapping_paths: with open(vocab_mapping_path, "r") as fin: vocab_mapping = json.loads(fin.read()) slm_to_llm_vocab_mappings.append(vocab_mapping) trainer = FedMKTLLM( ctx=ctx, model=model, training_args=training_args, fed_args=fed_args, train_set=pub_dataset, val_set=validate_dataset, tokenizer=tokenizer, slm_tokenizers=slm_tokenizers, slm_to_llm_vocab_mappings=slm_to_llm_vocab_mappings, save_trainable_weights_only=self.save_trainable_weights_only, ) return trainer def slm_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None): ctx, model, optimizer, tokenizer, training_args, fed_args, pub_dataset = self.common_setup( output_dir=output_dir, saved_model=saved_model) priv_dataset = loader_load_from_conf(self.priv_dataset_conf) priv_dataset.load(train_set) if validate_set is not None: validate_dataset = loader_load_from_conf(self.priv_dataset_conf) validate_dataset.load(validate_set) else: validate_dataset = None llm_tokenizer = loader_load_from_conf(self.llm_tokenizer_conf) with open(self.llm_to_slm_vocab_mapping_path, "r") as fin: vocab_mapping = json.loads(fin.read()) priv_data_collator = loader_load_from_conf(self.data_collator_conf) trainer = FedMKTSLM( ctx=ctx, model=model, training_args=training_args, fed_args=fed_args, pub_train_set=pub_dataset, priv_train_set=priv_dataset, val_set=validate_dataset, tokenizer=tokenizer, save_trainable_weights_only=self.save_trainable_weights_only, llm_tokenizer=llm_tokenizer, llm_to_slm_vocab_mapping=vocab_mapping, data_collator=priv_data_collator ) return trainer def train( self, train_data: Optional[Union[str, DataFrame]] = None, 
validate_data: Optional[Union[str, DataFrame]] = None, output_dir: str = None, saved_model_path: str = None, ): if self.is_client(): trainer = self.slm_setup(train_set=train_data, validate_set=validate_data, output_dir=output_dir, saved_model=saved_model_path) trainer.train() else: trainer = self.llm_setup( train_set=train_data, validate_set=validate_data, output_dir=output_dir, saved_model=saved_model_path ) trainer.train() self.trainer = trainer if self.training_args.deepspeed and self.training_args.local_rank != 0: pass else: trainer.save_model(output_dir) def predict(self, *args, **kwargs): pass ================================================ FILE: python/fate_llm/runner/homo_seq2seq_runner.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from fate.components.components.nn.nn_runner import ( NNRunner, load_model_dict_from_path, dir_warning, loader_load_from_conf, run_dataset_func, ) from fate.components.components.nn.runner.homo_default_runner import DefaultRunner from fate.ml.nn.homo.fedavg import FedAVGArguments from fate_llm.algo.fedavg.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer from typing import Dict from fate.components.components.nn.loader import Loader import torch.nn as nn import torch.optim as optim from fate.ml.nn.trainer.trainer_base import FedArguments, HomoTrainerServer from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments, HomoSeq2SeqTrainerClient from typing import Union, Type, Callable, Optional from transformers.trainer_utils import get_last_checkpoint from typing import Literal import logging from fate.arch.dataframe import DataFrame logger = logging.getLogger(__name__) SUPPORTED_ALGO = ["fedavg", "ot"] def _check_instances( trainer: Union[Type[HomoSeq2SeqTrainerClient], Type[HomoTrainerServer]] = None, fed_args: FedArguments = None, model: nn.Module = None, optimizer: optim.Optimizer = None, train_args: Seq2SeqTrainingArguments = None, data_collator: Callable = None, ) -> None: if trainer is not None and not ( issubclass(type(trainer), HomoSeq2SeqTrainerClient) or issubclass(type(trainer), HomoTrainerServer) ): raise TypeError( f"SetupReturn Error: trainer must be a subclass of either " f"HomoSeq2SeqTrainerClient or HomoSeq2SeqTrainerClient but got {type(trainer)}" ) if fed_args is not None and not isinstance(fed_args, FedArguments): raise TypeError(f"SetupReturn Error: fed_args must be an instance of FedArguments but got {type(fed_args)}") if model is not None and not issubclass(type(model), nn.Module): raise TypeError(f"SetupReturn Error: model must be a subclass of torch.nn.Module but got {type(model)}") if optimizer is not None and not issubclass(type(optimizer), optim.Optimizer): raise TypeError( f"SetupReturn Error: optimizer must be a subclass of torch.optim.Optimizer but got {type(optimizer)}" ) if train_args is not None and not isinstance(train_args, Seq2SeqTrainingArguments): raise TypeError( f"SetupReturn Error: train_args must be an 
instance of Seq2SeqTrainingArguments " f"but got {type(train_args)}" ) if data_collator is not None and not callable(data_collator): raise TypeError(f"SetupReturn Error: data_collator must be callable but got {type(data_collator)}") class Seq2SeqRunner(DefaultRunner): def __init__( self, algo: str = "fedavg", model_conf: Optional[Dict] = None, dataset_conf: Optional[Dict] = None, optimizer_conf: Optional[Dict] = None, training_args_conf: Optional[Dict] = None, fed_args_conf: Optional[Dict] = None, data_collator_conf: Optional[Dict] = None, tokenizer_conf: Optional[Dict] = None, task_type: Literal["causal_lm", "other"] = "causal_lm", local_mode: bool = False, save_trainable_weights_only: bool = False, ) -> None: super(NNRunner, self).__init__() self.algo = algo self.model_conf = model_conf self.dataset_conf = dataset_conf self.optimizer_conf = optimizer_conf self.training_args_conf = training_args_conf self.fed_args_conf = fed_args_conf self.data_collator_conf = data_collator_conf self.local_mode = local_mode self.tokenizer_conf = tokenizer_conf self.task_type = task_type self.save_trainable_weights_only = save_trainable_weights_only # check param if self.algo not in SUPPORTED_ALGO: raise ValueError(f"algo should be one of {SUPPORTED_ALGO}") if self.task_type not in ["causal_lm", "others"]: raise ValueError("task_type should be one of [binary, multi, regression, others]") assert isinstance(self.local_mode, bool), "local should be bool" # setup var self.trainer = None self.training_args = None def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): if stage == "predict": self.local_mode = True if self.algo == "fedavg": client_class: Seq2SeqFedAVGClient = Seq2SeqFedAVGClient else: raise ValueError(f"algo {self.algo} not supported") ctx = self.get_context() model = loader_load_from_conf(self.model_conf) if model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") if output_dir is None: output_dir = "./" resume_path = None if saved_model is not None: model_dict = load_model_dict_from_path(saved_model) model.load_state_dict(model_dict) logger.info(f"loading model dict from {saved_model} to model done") if get_last_checkpoint(saved_model) is not None: resume_path = saved_model logger.info(f"checkpoint detected, resume_path set to {resume_path}") # load optimizer if self.optimizer_conf: optimizer_loader = Loader.from_dict(self.optimizer_conf) optimizer_ = optimizer_loader.load_item() optimizer_params = optimizer_loader.kwargs optimizer = optimizer_(model.parameters(), **optimizer_params) else: optimizer = None # load collator func data_collator = loader_load_from_conf(self.data_collator_conf) # load tokenizer if import conf provided tokenizer = loader_load_from_conf(self.tokenizer_conf) # args dir_warning(self.training_args_conf) training_args = Seq2SeqTrainingArguments(**self.training_args_conf) self.training_args = training_args # reset to default, saving to arbitrary path is not allowed in # DefaultRunner training_args.output_dir = output_dir training_args.resume_from_checkpoint = resume_path # resume path fed_args = FedAVGArguments(**self.fed_args_conf) # prepare trainer trainer = client_class( ctx=ctx, model=model, optimizer=optimizer, training_args=training_args, fed_args=fed_args, data_collator=data_collator, tokenizer=tokenizer, train_set=train_set, val_set=validate_set, local_mode=self.local_mode, save_trainable_weights_only=self.save_trainable_weights_only, ) _check_instances( trainer=trainer, 
model=model, optimizer=optimizer, train_args=training_args, fed_args=fed_args, data_collator=data_collator, ) return trainer def server_setup(self, stage="train"): if stage == "predict": self.local_mode = True if self.algo == "fedavg": server_class: Seq2SeqFedAVGServer = Seq2SeqFedAVGServer else: raise ValueError(f"algo {self.algo} not supported") ctx = self.get_context() trainer = server_class(ctx=ctx, local_mode=self.local_mode) _check_instances(trainer) return trainer def predict(self, test_data: Union[str, DataFrame], saved_model_path: str = None) -> Union[DataFrame, None]: if self.is_client(): test_set = self._prepare_data(test_data, "test_data") if self.trainer is not None: trainer = self.trainer logger.info("trainer found, skip setting up") else: trainer = self.client_setup(saved_model=saved_model_path, stage="predict") classes = run_dataset_func(test_set, "get_classes") match_ids = run_dataset_func(test_set, "get_match_ids") sample_ids = run_dataset_func(test_set, "get_sample_ids") match_id_name = run_dataset_func(test_set, "get_match_id_name") sample_id_name = run_dataset_func(test_set, "get_sample_id_name") if not self.training_args.predict_with_generate: return pred_rs = trainer.predict(test_set) if self.training_args and self.training_args.deepspeed and self.training_args.local_rank != 0: return rs_df = self.get_nn_output_dataframe( self.get_context(), pred_rs.predictions, pred_rs.label_ids if hasattr(pred_rs, "label_ids") else None, match_ids, sample_ids, match_id_name=match_id_name, sample_id_name=sample_id_name, dataframe_format="dist_df", task_type=self.task_type, classes=classes, ) return rs_df else: # server not predict return ================================================ FILE: python/fate_llm/runner/inferdpt_runner.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
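# InferDPTRunner only implements the "train" stage: the client loads a HuggingfaceDataset, runs
# InferDPTClient.inference() with the encode/instruction/decode templates and pickles the result to
# <output_dir>/inference_result.pkl, while the server answers the remote inference requests.
# A minimal, illustrative sketch; the template strings and kwargs below are assumptions for
# illustration, only the keyword names match what this runner forwards:
#
#     runner = InferDPTRunner(
#         inferdpt_init_conf={...},   # Loader conf resolving to an InferInit subclass
#         encode_template="{{text}}",
#         instruction_template="Answer the question: {{perturbed_doc}}",
#         decode_template="{{perturbed_response}}",
#         dataset_conf={...},         # Loader conf for a HuggingfaceDataset
#         remote_inference_kwargs={},
#         local_inference_kwargs={},
#     )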
from fate.components.components.nn.nn_runner import ( NNRunner, load_model_dict_from_path, dir_warning, loader_load_from_conf, run_dataset_func, ) import os from datetime import datetime from fate.components.components.nn.nn_runner import NNRunner from typing import Dict from fate.components.components.nn.loader import Loader import torch import torch.nn as nn import torch.optim as optim from typing import Union, Type, Callable, Optional from typing import Literal import logging from fate_llm.algo.inferdpt.inferdpt import InferDPTClient, InferDPTServer from fate_llm.algo.inferdpt.init._init import InferInit from fate.components.components.nn.loader import Loader from fate_llm.dataset.hf_dataset import HuggingfaceDataset, Dataset from fate.arch.dataframe import DataFrame logger = logging.getLogger(__name__) class InferDPTRunner(NNRunner): def __init__( self, inferdpt_init_conf: Dict, encode_template: str = None, instruction_template: str = None, decode_template: str = None, dataset_conf: Optional[Dict] = None, remote_inference_kwargs: Dict = {}, local_inference_kwargs: Dict = {}, perturb_doc_key: str = 'perturbed_doc', perturbed_response_key: str = 'perturbed_response', result_key: str = 'inferdpt_result', ) -> None: self.inferdpt_init_conf = inferdpt_init_conf self.encode_template = encode_template self.instruction_template = instruction_template self.decode_template = decode_template self.dataset_conf = dataset_conf self.remote_inference_kwargs = remote_inference_kwargs self.local_inference_kwargs = local_inference_kwargs self.perturb_doc_key = perturb_doc_key self.perturbed_response_key = perturbed_response_key self.result_key = result_key def _get_inst(self): loader = Loader.from_dict(self.inferdpt_init_conf) init_inst = loader.load_item()(self.get_context()) assert isinstance(init_inst, InferInit), 'Need a InferDPTInit class for initialization, but got {}'.format(type(init_inst)) inferdpt_inst = init_inst.get_inst() logger.info('inferdpt inst loaded') return inferdpt_inst def client_setup(self): client_inst = self._get_inst() assert isinstance(client_inst, InferDPTClient), 'Client need to get an InferDPTClient class to run the algo' return client_inst def server_setup(self): server_inst = self._get_inst() assert isinstance(server_inst, InferDPTServer), 'Server need to get an InferDPTServer class to run the algo' return server_inst def _prepare_data(self, data, data_name): if data is None: return None if isinstance(data, DataFrame) and self.dataset_conf is None: raise ValueError('DataFrame format dataset is not supported, please use bind path to load your dataset') else: dataset = loader_load_from_conf(self.dataset_conf) if hasattr(dataset, "load"): logger.info("load path is {}".format(data)) load_output = dataset.load(data) if load_output is not None: dataset = load_output return dataset else: raise ValueError( f"The dataset {dataset} lacks a load() method, which is required for data parsing in the DefaultRunner. \ Please implement this method in your dataset class. You can refer to the base class 'Dataset' in 'fate.ml.nn.dataset.base' \ for the necessary interfaces to implement." 
) if dataset is not None and not issubclass(type(dataset), Dataset): raise TypeError( f"SetupReturn Error: {data_name}_set must be a subclass of fate built-in Dataset but got {type(dataset)}, \n" f"You can get the class via: from fate.ml.nn.dataset.table import Dataset" ) return dataset def train( self, train_data: Optional[Union[str]] = None, validate_data: Optional[Union[str]] = None, output_dir: str = None, saved_model_path: str = None, ) -> None: if self.is_client(): dataset_0 = self._prepare_data(train_data, "train_data") logger.info('dataset loaded') if dataset_0 is None: raise ValueError('You must provide a dataset for inference') assert isinstance(dataset_0, HuggingfaceDataset), 'Currently only HuggingfaceDataset is supported for inference, but got {}'.format(type(dataset_0)) logger.info('initializing inst') client_inst = self.client_setup() pred_rs = client_inst.inference( dataset_0, self.encode_template, self.instruction_template, self.decode_template, \ remote_inference_kwargs=self.remote_inference_kwargs, local_inference_kwargs=self.local_inference_kwargs ) logger.info('predict done') saving_path = output_dir + '/' + 'inference_result.pkl' logger.info('result saved to path {}'.format(saving_path)) torch.save(pred_rs, saving_path) elif self.is_server(): server_inst = self.server_setup() server_inst.inference() else: raise ValueError('Unknown role') def predict( self, test_data: Optional[Union[str]] = None, output_dir: str = None, saved_model_path: str = None ): logger.warning('Prediction mode is not supported by this algorithm in the current version, please use the train mode to run inferdpt inference.') return ================================================ FILE: python/fate_llm/runner/offsite_tuning_runner.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
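# OTRunner reuses the Seq2SeqRunner plumbing but builds Offsite-Tuning trainers in setup(): the
# client trains the locally held adapter/emulator weights (save_trainable_weights_only limits what
# is saved), the server keeps the full model, and aggregate_model controls whether client adapter
# weights are additionally aggregated FedAVG-style across parties. Illustrative instantiation,
# config values assumed:
#
#     runner = OTRunner(model_conf={...}, training_args_conf={...}, fed_args_conf={},
#                       aggregate_model=True, save_trainable_weights_only=True)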
from fate.components.components.nn.nn_runner import ( load_model_dict_from_path, dir_warning, loader_load_from_conf, ) from fate.ml.nn.homo.fedavg import FedAVGArguments from fate_llm.algo.fedavg.fedavg import Seq2SeqFedAVGServer from typing import Dict from fate.components.components.nn.loader import Loader from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments from typing import Union, Optional from transformers.trainer_utils import get_last_checkpoint from typing import Literal import logging from fate.arch.dataframe import DataFrame from fate_llm.runner.homo_seq2seq_runner import Seq2SeqRunner, _check_instances from fate_llm.algo.offsite_tuning.offsite_tuning import OffsiteTuningTrainerClient, OffsiteTuningTrainerServer logger = logging.getLogger(__name__) SUPPORTED_ALGO = ["fedavg"] class OTRunner(Seq2SeqRunner): def __init__( self, model_conf: Optional[Dict] = None, dataset_conf: Optional[Dict] = None, optimizer_conf: Optional[Dict] = None, training_args_conf: Optional[Dict] = None, fed_args_conf: Optional[Dict] = None, data_collator_conf: Optional[Dict] = None, tokenizer_conf: Optional[Dict] = None, task_type: Literal["causal_lm", "other"] = "causal_lm", save_trainable_weights_only: bool = False, aggregate_model: bool = False, algo: str = 'ot' ) -> None: super(OTRunner, self).__init__( algo, model_conf, dataset_conf, optimizer_conf, training_args_conf, fed_args_conf, data_collator_conf, tokenizer_conf, task_type, local_mode=False ) self.aggregate_model = aggregate_model self.save_trainable_weights_only = save_trainable_weights_only def setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): if stage == "predict": self.local_mode = True ctx = self.get_context() model = loader_load_from_conf(self.model_conf) if model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") if output_dir is None: output_dir = "./" resume_path = None if saved_model is not None: model_dict = load_model_dict_from_path(saved_model) model.load_state_dict(model_dict) logger.info(f"loading model dict from {saved_model} to model done") if get_last_checkpoint(saved_model) is not None: resume_path = saved_model logger.info(f"checkpoint detected, resume_path set to {resume_path}") # load optimizer if self.optimizer_conf: optimizer_loader = Loader.from_dict(self.optimizer_conf) optimizer_ = optimizer_loader.load_item() optimizer_params = optimizer_loader.kwargs optimizer = optimizer_(model.parameters(), **optimizer_params) else: optimizer = None # load collator func data_collator = loader_load_from_conf(self.data_collator_conf) # load tokenizer if import conf provided tokenizer = loader_load_from_conf(self.tokenizer_conf) # args dir_warning(self.training_args_conf) training_args = Seq2SeqTrainingArguments(**self.training_args_conf) self.training_args = training_args # reset to default, saving to arbitrary path is not allowed in # DefaultRunner training_args.output_dir = output_dir training_args.resume_from_checkpoint = resume_path # resume path fed_args = FedAVGArguments(**self.fed_args_conf) # prepare trainer if self.is_client(): trainer = OffsiteTuningTrainerClient( ctx=ctx, model=model, optimizer=optimizer, training_args=training_args, fed_args=fed_args, data_collator=data_collator, tokenizer=tokenizer, train_set=train_set, val_set=validate_set, save_trainable_weights_only=self.save_trainable_weights_only, aggregate_model=self.aggregate_model ) elif self.is_server(): trainer = OffsiteTuningTrainerServer( ctx=ctx, 
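            # server-side trainer: only the full model and the aggregate_model flag, no datasets attached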
model=model, aggregate_model=self.aggregate_model ) _check_instances( trainer=trainer, model=model, optimizer=optimizer, train_args=training_args, fed_args=fed_args, data_collator=data_collator, ) return trainer def server_setup(self, stage="train"): if stage == "predict": self.local_mode = True if self.algo == "fedavg": server_class: Seq2SeqFedAVGServer = Seq2SeqFedAVGServer else: raise ValueError(f"algo {self.algo} not supported") ctx = self.get_context() trainer = server_class(ctx=ctx, local_mode=self.local_mode) _check_instances(trainer) return trainer def train( self, train_data: Optional[Union[str, DataFrame]] = None, validate_data: Optional[Union[str, DataFrame]] = None, output_dir: str = None, saved_model_path: str = None, ): if self.is_client(): train_set = self._prepare_data(train_data, "train_data") validate_set = self._prepare_data(validate_data, "val_data") trainer = self.setup( train_set=train_set, validate_set=validate_set, output_dir=output_dir, saved_model=saved_model_path ) self.trainer = trainer trainer.train() elif self.is_server(): trainer = self.setup( train_set=None, validate_set=None, output_dir=output_dir, saved_model=saved_model_path ) trainer.train() if output_dir is not None: if self.training_args.deepspeed and self.training_args.local_rank != 0: pass else: trainer.save_model(output_dir) ================================================ FILE: python/fate_llm/trainer/__init__.py ================================================ ================================================ FILE: python/fate_llm/trainer/seq2seq_trainer.py ================================================ # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
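# Seq2SeqTrainingArguments below narrows the HuggingFace defaults for pipeline use (output_dir="./",
# save_strategy="no", constant lr scheduler, tqdm disabled), and its to_dict() reports only fields
# that differ from those defaults, so a config dict round-trips without dragging every HF default
# along. Small illustrative example; the serialized values are assumptions about HF's to_dict():
#
#     args = Seq2SeqTrainingArguments(num_train_epochs=2, learning_rate=5e-5)
#     args.to_dict()   # -> roughly {"num_train_epochs": 2.0, "learning_rate": 5e-05}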
# from transformers import Seq2SeqTrainingArguments as _hf_Seq2SeqTrainingArguments, Seq2SeqTrainer from dataclasses import dataclass, field from typing import Optional from fate.ml.nn.trainer.trainer_base import HomoTrainerMixin, FedArguments, get_ith_checkpoint import os import torch import copy from torch import nn from typing import Any, Dict, List, Callable from enum import Enum from fate.arch import Context from torch.optim import Optimizer from torch.utils.data import DataLoader, Dataset from transformers import PreTrainedTokenizer from transformers import Trainer, EvalPrediction from transformers.trainer_utils import has_length from torch.utils.data import _utils from transformers.trainer_callback import TrainerCallback from typing import Optional from dataclasses import dataclass, field from transformers.modeling_utils import unwrap_model TRAINABLE_WEIGHTS_NAME = "adapter_model.bin" @dataclass class _S2STrainingArguments(_hf_Seq2SeqTrainingArguments): # in fate-2.0, we will control the output dir when using pipeline output_dir: str = field(default="./") disable_tqdm: bool = field(default=True) save_strategy: str = field(default="no") logging_strategy: str = field(default="epoch") logging_steps: int = field(default=1) evaluation_strategy: str = field(default="no") logging_dir: str = field(default=None) checkpoint_idx: int = field(default=None) # by default, we use constant learning rate, the same as FATE-1.X lr_scheduler_type: str = field(default="constant") log_level: str = field(default="info") deepspeed: Optional[str] = field(default=None) save_safetensors: bool = field(default=False) use_cpu: bool = field(default=False) def __post_init__(self): self.push_to_hub = False self.hub_model_id = None self.hub_strategy = "every_save" self.hub_token = None self.hub_private_repo = False self.push_to_hub_model_id = None self.push_to_hub_organization = None self.push_to_hub_token = None super().__post_init__() DEFAULT_ARGS = _S2STrainingArguments().to_dict() @dataclass class Seq2SeqTrainingArguments(_S2STrainingArguments): # To simplify the to dict result(to_dict only return non-default args) def to_dict(self): # Call the superclass's to_dict method all_args = super().to_dict() # Get a dict with default values for all fields default_args = copy.deepcopy(DEFAULT_ARGS) # Filter out args that are equal to their default values set_args = {name: value for name, value in all_args.items() if value != default_args.get(name)} return set_args class HomoSeq2SeqTrainerClient(Seq2SeqTrainer, HomoTrainerMixin): def __init__( self, ctx: Context, model: nn.Module, training_args: Seq2SeqTrainingArguments, fed_args: FedArguments, train_set: Dataset, val_set: Dataset = None, optimizer: torch.optim.Optimizer = None, data_collator: Callable = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, tokenizer: Optional[PreTrainedTokenizer] = None, callbacks: Optional[List[TrainerCallback]] = [], compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, local_mode: bool = False, save_trainable_weights_only: bool = False, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): # in case you forget to set evaluation_strategy if val_set is not None and training_args.evaluation_strategy == "no": training_args.evaluation_strategy = "epoch" HomoTrainerMixin.__init__( self, ctx=ctx, model=model, optimizer=optimizer, training_args=training_args, fed_args=fed_args, train_set=train_set, val_set=val_set, scheduler=scheduler, callbacks=callbacks, 
compute_metrics=compute_metrics, local_mode=local_mode, save_trainable_weights_only=save_trainable_weights_only, ) # concat checkpoint path if checkpoint idx is set if self._args.checkpoint_idx is not None: checkpoint_path = self._args.resume_from_checkpoint if checkpoint_path is not None and os.path.exists(checkpoint_path): checkpoint_folder = get_ith_checkpoint(checkpoint_path, self._args.checkpoint_idx) self._args.resume_from_checkpoint = os.path.join(checkpoint_path, checkpoint_folder) Trainer.__init__( self, model=model, args=self._args, train_dataset=train_set, eval_dataset=val_set, data_collator=data_collator, optimizers=(optimizer, scheduler), tokenizer=tokenizer, compute_metrics=self._compute_metrics_warp_func, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) self._add_fate_callback(self.callback_handler) def _save( self, output_dir: Optional[str] = None, state_dict=None ): if not self._save_trainable_weights_only: return super()._save(output_dir, state_dict) else: model = unwrap_model(self.model) if hasattr(model, "save_trainable"): model.save_trainable(output_dir) else: state_dict = { k: p.to("cpu") for k, p in model.named_parameters() if p.requires_grad } torch.save(state_dict, os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) ================================================ FILE: python/requirements.txt ================================================ accelerate==0.27.2 deepspeed==0.13.3 peft==0.8.2 sentencepiece==0.2.0 lm_eval==0.4.2 rouge-score==0.1.2 datasets==2.18.0 editdistance torch==2.3.1 transformers==4.37.2 opacus==1.4.1 fastchat Jinja2 sentence-transformers openai ================================================ FILE: python/setup.py ================================================ # -*- coding: utf-8 -*- # # Copyright 2024 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
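# Note: the pinned install_requires list below mirrors python/requirements.txt; the two lists are
# expected to stay in sync when versions are bumped.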
# from setuptools import find_packages, setup # Define the packages and modules packages = find_packages(".") package_data = {"": ["*"]} # Define dependencies install_requires = [ "accelerate==0.27.2", "deepspeed==0.13.3", "peft==0.8.2", "sentencepiece==0.2.0", "lm_eval==0.4.2", "rouge-score==0.1.2", "datasets==2.18.0", "editdistance", "torch==2.3.1", "transformers==4.37.2", "opacus==1.4.1", "fastchat", "Jinja2", "sentence-transformers", "openai" ] # Define the entry points for command-line tools entry_points = { "console_scripts": [ "fate_llm = fate_llm.evaluate.scripts.fate_llm_cli:fate_llm_cli" ] } extras_require = { "fate": ["pyfate==2.2.0"], "fate_flow": ["fate_flow==2.2.0"], "fate_client": ["fate_client==2.2.0"] } # Configure and call the setup function setup_kwargs = { "name": "fate_llm", "version": "2.2.0", "description": "Federated Learning for Large Language Models", "long_description": "Federated Learning for Large Language Models (FATE-LLM) provides a framework to train and evaluate large language models in a federated manner.", "long_description_content_type": "text/markdown", "author": "FederatedAI", "author_email": "contact@FedAI.org", "url": "https://fate.fedai.org/", "packages": packages, "install_requires": install_requires, "entry_points": entry_points, "extras_require": extras_require, "python_requires": ">=3.8", "include_package_data": True } setup(**setup_kwargs)
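# Illustrative installation commands, assuming the package is built from this directory; the extras
# map to the extras_require entries defined above:
#
#     pip install .                  # core fate_llm package
#     pip install ".[fate_client]"   # additionally pulls fate_client==2.2.0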